howard.objects.variants
1import csv 2import gc 3import gzip 4import io 5import multiprocessing 6import os 7import random 8import re 9import shlex 10import sqlite3 11import subprocess 12from tempfile import NamedTemporaryFile, TemporaryDirectory 13import tempfile 14import duckdb 15import json 16import yaml 17import argparse 18import Bio.bgzf as bgzf 19import pandas as pd 20from pyfaidx import Fasta 21import numpy as np 22import vcf 23import logging as log 24import fastparquet as fp 25from multiprocesspandas import applyparallel 26 27from howard.functions.commons import * 28from howard.objects.database import * 29from howard.functions.databases import * 30from howard.functions.utils import * 31 32 33class Variants: 34 35 def __init__( 36 self, 37 conn=None, 38 input: str = None, 39 output: str = None, 40 config: dict = {}, 41 param: dict = {}, 42 load: bool = False, 43 ) -> None: 44 """ 45 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 46 header 47 48 :param conn: the connection to the database 49 :param input: the input file 50 :param output: the output file 51 :param config: a dictionary containing the configuration of the model 52 :param param: a dictionary containing the parameters of the model 53 """ 54 55 # Init variables 56 self.init_variables() 57 58 # Input 59 self.set_input(input) 60 61 # Config 62 self.set_config(config) 63 64 # Param 65 self.set_param(param) 66 67 # Output 68 self.set_output(output) 69 70 # connexion 71 self.set_connexion(conn) 72 73 # Header 74 self.set_header() 75 76 # Load data 77 if load: 78 self.load_data() 79 80 def set_input(self, input: str = None) -> None: 81 """ 82 The function `set_input` takes a file name as input, extracts the name and extension, and sets 83 attributes in the class accordingly. 84 85 :param input: The `set_input` method in the provided code snippet is used to set attributes 86 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 87 :type input: str 88 """ 89 90 if input and not isinstance(input, str): 91 try: 92 self.input = input.name 93 except: 94 log.error(f"Input file '{input} in bad format") 95 raise ValueError(f"Input file '{input} in bad format") 96 else: 97 self.input = input 98 99 # Input format 100 if input: 101 input_name, input_extension = os.path.splitext(self.input) 102 self.input_name = input_name 103 self.input_extension = input_extension 104 self.input_format = self.input_extension.replace(".", "") 105 106 def set_config(self, config: dict) -> None: 107 """ 108 The set_config function takes a config object and assigns it as the configuration object for the 109 class. 110 111 :param config: The `config` parameter in the `set_config` function is a dictionary object that 112 contains configuration settings for the class. When you call the `set_config` function with a 113 dictionary object as the argument, it will set that dictionary as the configuration object for 114 the class 115 :type config: dict 116 """ 117 118 self.config = config 119 120 def set_param(self, param: dict) -> None: 121 """ 122 This function sets a parameter object for the class based on the input dictionary. 
123 124 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 125 as the `param` attribute of the class instance 126 :type param: dict 127 """ 128 129 self.param = param 130 131 def init_variables(self) -> None: 132 """ 133 This function initializes the variables that will be used in the rest of the class 134 """ 135 136 self.prefix = "howard" 137 self.table_variants = "variants" 138 self.dataframe = None 139 140 self.comparison_map = { 141 "gt": ">", 142 "gte": ">=", 143 "lt": "<", 144 "lte": "<=", 145 "equals": "=", 146 "contains": "SIMILAR TO", 147 } 148 149 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 150 151 self.code_type_map_to_sql = { 152 "Integer": "INTEGER", 153 "String": "VARCHAR", 154 "Float": "FLOAT", 155 "Flag": "VARCHAR", 156 } 157 158 self.index_additionnal_fields = [] 159 160 def get_indexing(self) -> bool: 161 """ 162 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 163 returns False. 164 :return: The value of the indexing parameter. 165 """ 166 167 return self.get_param().get("indexing", False) 168 169 def get_connexion_config(self) -> dict: 170 """ 171 The function `get_connexion_config` returns a dictionary containing the configuration for a 172 connection, including the number of threads and memory limit. 173 :return: a dictionary containing the configuration for the Connexion library. 
174 """ 175 176 # config 177 config = self.get_config() 178 179 # Connexion config 180 connexion_config = {} 181 threads = self.get_threads() 182 183 # Threads 184 if threads: 185 connexion_config["threads"] = threads 186 187 # Memory 188 # if config.get("memory", None): 189 # connexion_config["memory_limit"] = config.get("memory") 190 if self.get_memory(): 191 connexion_config["memory_limit"] = self.get_memory() 192 193 # Temporary directory 194 if config.get("tmp", None): 195 connexion_config["temp_directory"] = config.get("tmp") 196 197 # Access 198 if config.get("access", None): 199 access = config.get("access") 200 if access in ["RO"]: 201 access = "READ_ONLY" 202 elif access in ["RW"]: 203 access = "READ_WRITE" 204 connexion_db = self.get_connexion_db() 205 if connexion_db in ":memory:": 206 access = "READ_WRITE" 207 connexion_config["access_mode"] = access 208 209 return connexion_config 210 211 def get_duckdb_settings(self) -> dict: 212 """ 213 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 214 string. 215 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 216 """ 217 218 # config 219 config = self.get_config() 220 221 # duckdb settings 222 duckdb_settings_dict = {} 223 if config.get("duckdb_settings", None): 224 duckdb_settings = config.get("duckdb_settings") 225 duckdb_settings = full_path(duckdb_settings) 226 # duckdb setting is a file 227 if os.path.exists(duckdb_settings): 228 with open(duckdb_settings) as json_file: 229 duckdb_settings_dict = yaml.safe_load(json_file) 230 # duckdb settings is a string 231 else: 232 duckdb_settings_dict = json.loads(duckdb_settings) 233 234 return duckdb_settings_dict 235 236 def set_connexion_db(self) -> str: 237 """ 238 The function `set_connexion_db` returns the appropriate database connection string based on the 239 input format and connection type. 240 :return: the value of the variable `connexion_db`. 
241 """ 242 243 # Default connexion db 244 default_connexion_db = ":memory:" 245 246 # Find connexion db 247 if self.get_input_format() in ["db", "duckdb"]: 248 connexion_db = self.get_input() 249 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 250 connexion_db = default_connexion_db 251 elif self.get_connexion_type() in ["tmpfile"]: 252 tmp_name = tempfile.mkdtemp( 253 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 254 ) 255 connexion_db = f"{tmp_name}/tmp.db" 256 elif self.get_connexion_type() != "": 257 connexion_db = self.get_connexion_type() 258 else: 259 connexion_db = default_connexion_db 260 261 # Set connexion db 262 self.connexion_db = connexion_db 263 264 return connexion_db 265 266 def set_connexion(self, conn) -> None: 267 """ 268 The function `set_connexion` creates a connection to a database, with options for different 269 database formats and settings. 270 271 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 272 database. If a connection is not provided, a new connection to an in-memory database is created. 
273 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 274 sqlite 275 """ 276 277 # Connexion db 278 connexion_db = self.set_connexion_db() 279 280 # Connexion config 281 connexion_config = self.get_connexion_config() 282 283 # Connexion format 284 connexion_format = self.get_config().get("connexion_format", "duckdb") 285 # Set connexion format 286 self.connexion_format = connexion_format 287 288 # Connexion 289 if not conn: 290 if connexion_format in ["duckdb"]: 291 conn = duckdb.connect(connexion_db, config=connexion_config) 292 # duckDB settings 293 duckdb_settings = self.get_duckdb_settings() 294 if duckdb_settings: 295 for setting in duckdb_settings: 296 setting_value = duckdb_settings.get(setting) 297 if isinstance(setting_value, str): 298 setting_value = f"'{setting_value}'" 299 conn.execute(f"PRAGMA {setting}={setting_value};") 300 elif connexion_format in ["sqlite"]: 301 conn = sqlite3.connect(connexion_db) 302 303 # Set connexion 304 self.conn = conn 305 306 # Log 307 log.debug(f"connexion_format: {connexion_format}") 308 log.debug(f"connexion_db: {connexion_db}") 309 log.debug(f"connexion config: {connexion_config}") 310 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 311 312 def set_output(self, output: str = None) -> None: 313 """ 314 The `set_output` function in Python sets the output file based on the input or a specified key 315 in the config file, extracting the output name, extension, and format. 316 317 :param output: The `output` parameter in the `set_output` method is used to specify the name of 318 the output file. If the config file has an 'output' key, the method sets the output to the value 319 of that key. 
If no output is provided, it sets the output to `None` 320 :type output: str 321 """ 322 323 if output and not isinstance(output, str): 324 self.output = output.name 325 else: 326 self.output = output 327 328 # Output format 329 if self.output: 330 output_name, output_extension = os.path.splitext(self.output) 331 self.output_name = output_name 332 self.output_extension = output_extension 333 self.output_format = self.output_extension.replace(".", "") 334 else: 335 self.output_name = None 336 self.output_extension = None 337 self.output_format = None 338 339 def set_header(self) -> None: 340 """ 341 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 342 """ 343 344 input_file = self.get_input() 345 default_header_list = [ 346 "##fileformat=VCFv4.2", 347 "#CHROM POS ID REF ALT QUAL FILTER INFO", 348 ] 349 350 # Full path 351 input_file = full_path(input_file) 352 353 if input_file: 354 355 input_format = self.get_input_format() 356 input_compressed = self.get_input_compressed() 357 config = self.get_config() 358 header_list = default_header_list 359 if input_format in [ 360 "vcf", 361 "hdr", 362 "tsv", 363 "csv", 364 "psv", 365 "parquet", 366 "db", 367 "duckdb", 368 ]: 369 # header provided in param 370 if config.get("header_file", None): 371 with open(config.get("header_file"), "rt") as f: 372 header_list = self.read_vcf_header(f) 373 # within a vcf file format (header within input file itsself) 374 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 375 # within a compressed vcf file format (.vcf.gz) 376 if input_compressed: 377 with bgzf.open(input_file, "rt") as f: 378 header_list = self.read_vcf_header(f) 379 # within an uncompressed vcf file format (.vcf) 380 else: 381 with open(input_file, "rt") as f: 382 header_list = self.read_vcf_header(f) 383 # header provided in default external file .hdr 384 elif os.path.exists((input_file + ".hdr")): 385 with open(input_file + ".hdr", "rt") as f: 386 header_list = 
self.read_vcf_header(f) 387 else: 388 try: # Try to get header info fields and file columns 389 390 with tempfile.TemporaryDirectory() as tmpdir: 391 392 # Create database 393 db_for_header = Database(database=input_file) 394 395 # Get header columns for infos fields 396 db_header_from_columns = ( 397 db_for_header.get_header_from_columns() 398 ) 399 400 # Get real columns in the file 401 db_header_columns = db_for_header.get_columns() 402 403 # Write header file 404 header_file_tmp = os.path.join(tmpdir, "header") 405 f = open(header_file_tmp, "w") 406 vcf.Writer(f, db_header_from_columns) 407 f.close() 408 409 # Replace #CHROM line with rel columns 410 header_list = db_for_header.read_header_file( 411 header_file=header_file_tmp 412 ) 413 header_list[-1] = "\t".join(db_header_columns) 414 415 except: 416 417 log.warning( 418 f"No header for file {input_file}. Set as default VCF header" 419 ) 420 header_list = default_header_list 421 422 else: # try for unknown format ? 423 424 log.error(f"Input file format '{input_format}' not available") 425 raise ValueError(f"Input file format '{input_format}' not available") 426 427 if not header_list: 428 header_list = default_header_list 429 430 # header as list 431 self.header_list = header_list 432 433 # header as VCF object 434 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 435 436 else: 437 438 self.header_list = None 439 self.header_vcf = None 440 441 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 442 """ 443 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 444 DataFrame based on the connection format. 445 446 :param query: The `query` parameter in the `get_query_to_df` function is a string that 447 represents the SQL query you want to execute. 
This query will be used to fetch data from a 448 database and convert it into a pandas DataFrame 449 :type query: str 450 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 451 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 452 function will only fetch up to that number of rows from the database query result. If no limit 453 is specified, 454 :type limit: int 455 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 456 """ 457 458 # Connexion format 459 connexion_format = self.get_connexion_format() 460 461 # Limit in query 462 if limit: 463 pd.set_option("display.max_rows", limit) 464 if connexion_format in ["duckdb"]: 465 df = ( 466 self.conn.execute(query) 467 .fetch_record_batch(limit) 468 .read_next_batch() 469 .to_pandas() 470 ) 471 elif connexion_format in ["sqlite"]: 472 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 473 474 # Full query 475 else: 476 if connexion_format in ["duckdb"]: 477 df = self.conn.execute(query).df() 478 elif connexion_format in ["sqlite"]: 479 df = pd.read_sql_query(query, self.conn) 480 481 return df 482 483 def get_overview(self) -> None: 484 """ 485 The function prints the input, output, config, and dataframe of the current object 486 """ 487 table_variants_from = self.get_table_variants(clause="from") 488 sql_columns = self.get_header_columns_as_sql() 489 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 490 df = self.get_query_to_df(sql_query_export) 491 log.info( 492 "Input: " 493 + str(self.get_input()) 494 + " [" 495 + str(str(self.get_input_format())) 496 + "]" 497 ) 498 log.info( 499 "Output: " 500 + str(self.get_output()) 501 + " [" 502 + str(str(self.get_output_format())) 503 + "]" 504 ) 505 log.info("Config: ") 506 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 507 "\n" 508 ): 509 log.info("\t" + str(d)) 510 log.info("Param: ") 511 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 512 "\n" 513 ): 514 log.info("\t" + str(d)) 515 log.info("Sample list: " + str(self.get_header_sample_list())) 516 log.info("Dataframe: ") 517 for d in str(df).split("\n"): 518 log.info("\t" + str(d)) 519 520 # garbage collector 521 del df 522 gc.collect() 523 524 return None 525 526 def get_stats(self) -> dict: 527 """ 528 The `get_stats` function calculates and returns various statistics of the current object, 529 including information about the input file, variants, samples, header fields, quality, and 530 SNVs/InDels. 531 :return: a dictionary containing various statistics of the current object. The dictionary has 532 the following structure: 533 """ 534 535 # Log 536 log.info(f"Stats Calculation...") 537 538 # table varaints 539 table_variants_from = self.get_table_variants() 540 541 # stats dict 542 stats = {"Infos": {}} 543 544 ### File 545 input_file = self.get_input() 546 stats["Infos"]["Input file"] = input_file 547 548 # Header 549 header_infos = self.get_header().infos 550 header_formats = self.get_header().formats 551 header_infos_list = list(header_infos) 552 header_formats_list = list(header_formats) 553 554 ### Variants 555 556 stats["Variants"] = {} 557 558 # Variants by chr 559 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 560 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 561 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 562 by=["CHROM"], kind="quicksort" 563 ) 564 565 # Total number of variants 566 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 567 568 # Calculate percentage 569 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 570 lambda x: (x / nb_of_variants) 571 ) 572 573 stats["Variants"]["Number of variants by chromosome"] = ( 574 nb_of_variants_by_chrom.to_dict(orient="index") 575 ) 576 577 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 578 579 ### Samples 580 581 # Init 582 samples = {} 583 nb_of_samples = 0 584 585 # Check Samples 586 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 587 log.debug(f"Check samples...") 588 for sample in self.get_header_sample_list(): 589 sql_query_samples = f""" 590 SELECT '{sample}' as sample, 591 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 592 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 593 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 594 FROM {table_variants_from} 595 WHERE ( 596 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 597 AND 598 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 599 ) 600 GROUP BY genotype 601 """ 602 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 603 sample_genotype_count = sql_query_genotype_df["count"].sum() 604 if len(sql_query_genotype_df): 605 nb_of_samples += 1 606 samples[f"{sample} - {sample_genotype_count} variants"] = ( 607 sql_query_genotype_df.to_dict(orient="index") 608 ) 609 610 stats["Samples"] = samples 611 stats["Infos"]["Number of samples"] = nb_of_samples 612 613 # # 614 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 615 # stats["Infos"]["Number of samples"] = nb_of_samples 616 # elif nb_of_samples: 617 # stats["Infos"]["Number of samples"] = "not a VCF format" 618 619 ### INFO and FORMAT fields 620 header_types_df = {} 621 header_types_list = { 622 "List of INFO fields": header_infos, 623 "List of FORMAT fields": header_formats, 624 } 625 i = 0 626 for header_type in header_types_list: 627 628 header_type_infos = header_types_list.get(header_type) 629 header_infos_dict = {} 630 631 for info in header_type_infos: 632 633 i += 1 634 header_infos_dict[i] = {} 635 636 # ID 637 header_infos_dict[i]["id"] = info 638 639 # num 640 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 641 if header_type_infos[info].num in genotype_map.keys(): 642 header_infos_dict[i]["Number"] = genotype_map.get( 643 header_type_infos[info].num 644 ) 645 else: 646 header_infos_dict[i]["Number"] = header_type_infos[info].num 647 648 # type 649 if header_type_infos[info].type: 650 header_infos_dict[i]["Type"] = header_type_infos[info].type 651 else: 652 header_infos_dict[i]["Type"] = "." 653 654 # desc 655 if header_type_infos[info].desc != None: 656 header_infos_dict[i]["Description"] = header_type_infos[info].desc 657 else: 658 header_infos_dict[i]["Description"] = "" 659 660 if len(header_infos_dict): 661 header_types_df[header_type] = pd.DataFrame.from_dict( 662 header_infos_dict, orient="index" 663 ).to_dict(orient="index") 664 665 # Stats 666 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 667 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 668 stats["Header"] = header_types_df 669 670 ### QUAL 671 if "QUAL" in self.get_header_columns(): 672 sql_query_qual = f""" 673 SELECT 674 avg(CAST(QUAL AS INTEGER)) AS Average, 675 min(CAST(QUAL AS INTEGER)) AS Minimum, 676 max(CAST(QUAL AS INTEGER)) AS Maximum, 677 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 678 median(CAST(QUAL AS INTEGER)) AS Median, 679 variance(CAST(QUAL AS INTEGER)) AS Variance 680 FROM {table_variants_from} 681 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 682 """ 683 684 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 685 stats["Quality"] = {"Stats": qual} 686 687 ### SNV and InDel 688 689 sql_query_snv = f""" 690 691 SELECT Type, count FROM ( 692 693 SELECT 694 'Total' AS Type, 695 count(*) AS count 696 FROM {table_variants_from} 697 698 UNION 699 700 SELECT 701 'MNV' AS Type, 702 count(*) AS count 703 FROM {table_variants_from} 704 WHERE len(REF) > 1 AND len(ALT) > 1 705 AND len(REF) = len(ALT) 706 707 UNION 708 709 SELECT 710 'InDel' AS Type, 711 count(*) AS count 712 FROM 
{table_variants_from} 713 WHERE len(REF) > 1 OR len(ALT) > 1 714 AND len(REF) != len(ALT) 715 716 UNION 717 718 SELECT 719 'SNV' AS Type, 720 count(*) AS count 721 FROM {table_variants_from} 722 WHERE len(REF) = 1 AND len(ALT) = 1 723 724 ) 725 726 ORDER BY count DESC 727 728 """ 729 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 730 731 sql_query_snv_substitution = f""" 732 SELECT 733 concat(REF, '>', ALT) AS 'Substitution', 734 count(*) AS count 735 FROM {table_variants_from} 736 WHERE len(REF) = 1 AND len(ALT) = 1 737 GROUP BY REF, ALT 738 ORDER BY count(*) DESC 739 """ 740 snv_substitution = ( 741 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 742 ) 743 stats["Variants"]["Counts"] = snv_indel 744 stats["Variants"]["Substitutions"] = snv_substitution 745 746 return stats 747 748 def stats_to_file(self, file: str = None) -> str: 749 """ 750 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 751 into a JSON object, and writes the JSON object to the specified file. 752 753 :param file: The `file` parameter is a string that represents the file path where the JSON data 754 will be written 755 :type file: str 756 :return: the name of the file that was written to. 757 """ 758 759 # Get stats 760 stats = self.get_stats() 761 762 # Serializing json 763 json_object = json.dumps(stats, indent=4) 764 765 # Writing to sample.json 766 with open(file, "w") as outfile: 767 outfile.write(json_object) 768 769 return file 770 771 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 772 """ 773 The `print_stats` function generates a markdown file and prints the statistics contained in a 774 JSON file in a formatted manner. 775 776 :param output_file: The `output_file` parameter is a string that specifies the path and filename 777 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 778 provided, a temporary directory will be created and the stats will be saved in a file named 779 "stats.md" within that 780 :type output_file: str 781 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 782 file where the statistics will be saved. If no value is provided, a temporary directory will be 783 created and a default file name "stats.json" will be used 784 :type json_file: str 785 :return: The function `print_stats` does not return any value. It has a return type annotation 786 of `None`. 787 """ 788 789 # Full path 790 output_file = full_path(output_file) 791 json_file = full_path(json_file) 792 793 with tempfile.TemporaryDirectory() as tmpdir: 794 795 # Files 796 if not output_file: 797 output_file = os.path.join(tmpdir, "stats.md") 798 if not json_file: 799 json_file = os.path.join(tmpdir, "stats.json") 800 801 # Create folders 802 if not os.path.exists(os.path.dirname(output_file)): 803 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 804 if not os.path.exists(os.path.dirname(json_file)): 805 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 806 807 # Create stats JSON file 808 stats_file = self.stats_to_file(file=json_file) 809 810 # Print stats file 811 with open(stats_file) as f: 812 stats = yaml.safe_load(f) 813 814 # Output 815 output_title = [] 816 output_index = [] 817 output = [] 818 819 # Title 820 output_title.append("# HOWARD Stats") 821 822 # Index 823 output_index.append("## Index") 824 825 # Process sections 826 for section in stats: 827 infos = stats.get(section) 828 section_link = "#" + section.lower().replace(" ", "-") 829 output.append(f"## {section}") 830 output_index.append(f"- [{section}]({section_link})") 831 832 if len(infos): 833 for info in infos: 834 try: 835 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 836 is_df = True 837 except: 838 try: 839 df = pd.DataFrame.from_dict( 840 
json.loads((infos.get(info))), orient="index" 841 ) 842 is_df = True 843 except: 844 is_df = False 845 if is_df: 846 output.append(f"### {info}") 847 info_link = "#" + info.lower().replace(" ", "-") 848 output_index.append(f" - [{info}]({info_link})") 849 output.append(f"{df.to_markdown(index=False)}") 850 else: 851 output.append(f"- {info}: {infos.get(info)}") 852 else: 853 output.append(f"NA") 854 855 # Write stats in markdown file 856 with open(output_file, "w") as fp: 857 for item in output_title: 858 fp.write("%s\n" % item) 859 for item in output_index: 860 fp.write("%s\n" % item) 861 for item in output: 862 fp.write("%s\n" % item) 863 864 # Output stats in markdown 865 print("") 866 print("\n\n".join(output_title)) 867 print("") 868 print("\n\n".join(output)) 869 print("") 870 871 return None 872 873 def get_input(self) -> str: 874 """ 875 It returns the value of the input variable. 876 :return: The input is being returned. 877 """ 878 return self.input 879 880 def get_input_format(self, input_file: str = None) -> str: 881 """ 882 This function returns the format of the input variable, either from the provided input file or 883 by prompting for input. 884 885 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 886 represents the file path of the input file. If no `input_file` is provided when calling the 887 method, it will default to `None` 888 :type input_file: str 889 :return: The format of the input variable is being returned. 890 """ 891 892 if not input_file: 893 input_file = self.get_input() 894 input_format = get_file_format(input_file) 895 return input_format 896 897 def get_input_compressed(self, input_file: str = None) -> str: 898 """ 899 The function `get_input_compressed` returns the format of the input variable after compressing 900 it. 901 902 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 903 that represents the file path of the input file. 
If no `input_file` is provided when calling the 904 method, it will default to `None` and the method will then call `self.get_input()` to 905 :type input_file: str 906 :return: The function `get_input_compressed` returns the compressed format of the input 907 variable. 908 """ 909 910 if not input_file: 911 input_file = self.get_input() 912 input_compressed = get_file_compressed(input_file) 913 return input_compressed 914 915 def get_output(self) -> str: 916 """ 917 It returns the output of the neuron. 918 :return: The output of the neural network. 919 """ 920 921 return self.output 922 923 def get_output_format(self, output_file: str = None) -> str: 924 """ 925 The function `get_output_format` returns the format of the input variable or the output file if 926 provided. 927 928 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 929 that represents the file path of the output file. If no `output_file` is provided when calling 930 the method, it will default to the output obtained from the `get_output` method of the class 931 instance. The 932 :type output_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not output_file: 937 output_file = self.get_output() 938 output_format = get_file_format(output_file) 939 940 return output_format 941 942 def get_config(self) -> dict: 943 """ 944 It returns the config 945 :return: The config variable is being returned. 946 """ 947 return self.config 948 949 def get_param(self) -> dict: 950 """ 951 It returns the param 952 :return: The param variable is being returned. 953 """ 954 return self.param 955 956 def get_connexion_db(self) -> str: 957 """ 958 It returns the connexion_db attribute of the object 959 :return: The connexion_db is being returned. 960 """ 961 return self.connexion_db 962 963 def get_prefix(self) -> str: 964 """ 965 It returns the prefix of the object. 966 :return: The prefix is being returned. 
967 """ 968 return self.prefix 969 970 def get_table_variants(self, clause: str = "select") -> str: 971 """ 972 This function returns the table_variants attribute of the object 973 974 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 975 defaults to select (optional) 976 :return: The table_variants attribute of the object. 977 """ 978 979 # Access 980 access = self.get_config().get("access", None) 981 982 # Clauses "select", "where", "update" 983 if clause in ["select", "where", "update"]: 984 table_variants = self.table_variants 985 # Clause "from" 986 elif clause in ["from"]: 987 # For Read Only 988 if self.get_input_format() in ["parquet"] and access in ["RO"]: 989 input_file = self.get_input() 990 table_variants = f"'{input_file}' as variants" 991 # For Read Write 992 else: 993 table_variants = f"{self.table_variants} as variants" 994 else: 995 table_variants = self.table_variants 996 return table_variants 997 998 def get_tmp_dir(self) -> str: 999 """ 1000 The function `get_tmp_dir` returns the temporary directory path based on configuration 1001 parameters or a default path. 1002 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1003 configuration, parameters, and a default value of "/tmp". 1004 """ 1005 1006 return get_tmp( 1007 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1008 ) 1009 1010 def get_connexion_type(self) -> str: 1011 """ 1012 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1013 1014 :return: The connexion type is being returned. 1015 """ 1016 return self.get_config().get("connexion_type", "memory") 1017 1018 def get_connexion(self): 1019 """ 1020 It returns the connection object 1021 1022 :return: The connection object. 1023 """ 1024 return self.conn 1025 1026 def close_connexion(self) -> None: 1027 """ 1028 This function closes the connection to the database. 
1029 :return: The connection is being closed. 1030 """ 1031 return self.conn.close() 1032 1033 def get_header(self, type: str = "vcf"): 1034 """ 1035 This function returns the header of the VCF file as a list of strings 1036 1037 :param type: the type of header you want to get, defaults to vcf (optional) 1038 :return: The header of the vcf file. 1039 """ 1040 1041 if self.header_vcf: 1042 if type == "vcf": 1043 return self.header_vcf 1044 elif type == "list": 1045 return self.header_list 1046 else: 1047 if type == "vcf": 1048 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1049 return header 1050 elif type == "list": 1051 return vcf_required 1052 1053 def get_header_length(self, file: str = None) -> int: 1054 """ 1055 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1056 line. 1057 1058 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1059 header file. If this argument is provided, the function will read the header from the specified 1060 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1061 :type file: str 1062 :return: the length of the header list, excluding the #CHROM line. 1063 """ 1064 1065 if file: 1066 return len(self.read_vcf_header_file(file=file)) - 1 1067 elif self.get_header(type="list"): 1068 return len(self.get_header(type="list")) - 1 1069 else: 1070 return 0 1071 1072 def get_header_columns(self) -> str: 1073 """ 1074 This function returns the header list of a VCF 1075 1076 :return: The length of the header list. 1077 """ 1078 if self.get_header(): 1079 return self.get_header(type="list")[-1] 1080 else: 1081 return "" 1082 1083 def get_header_columns_as_list(self) -> list: 1084 """ 1085 This function returns the header list of a VCF 1086 1087 :return: The length of the header list. 
1088 """ 1089 if self.get_header(): 1090 return self.get_header_columns().strip().split("\t") 1091 else: 1092 return [] 1093 1094 def get_header_columns_as_sql(self) -> str: 1095 """ 1096 This function retruns header length (without #CHROM line) 1097 1098 :return: The length of the header list. 1099 """ 1100 sql_column_list = [] 1101 for col in self.get_header_columns_as_list(): 1102 sql_column_list.append(f'"{col}"') 1103 return ",".join(sql_column_list) 1104 1105 def get_header_sample_list(self) -> list: 1106 """ 1107 This function retruns header length (without #CHROM line) 1108 1109 :return: The length of the header list. 1110 """ 1111 return self.header_vcf.samples 1112 1113 def get_verbose(self) -> bool: 1114 """ 1115 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1116 exist 1117 1118 :return: The value of the key "verbose" in the config dictionary. 1119 """ 1120 return self.get_config().get("verbose", False) 1121 1122 def get_connexion_format(self) -> str: 1123 """ 1124 It returns the connexion format of the object. 1125 :return: The connexion_format is being returned. 1126 """ 1127 connexion_format = self.connexion_format 1128 if connexion_format not in ["duckdb", "sqlite"]: 1129 log.error(f"Unknown connexion format {connexion_format}") 1130 raise ValueError(f"Unknown connexion format {connexion_format}") 1131 else: 1132 return connexion_format 1133 1134 def insert_file_to_table( 1135 self, 1136 file, 1137 columns: str, 1138 header_len: int = 0, 1139 sep: str = "\t", 1140 chunksize: int = 1000000, 1141 ) -> None: 1142 """ 1143 The function reads a file in chunks and inserts each chunk into a table based on the specified 1144 database format. 1145 1146 :param file: The `file` parameter is the file that you want to load into a table. 
It should be 1147 the path to the file on your system 1148 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1149 should contain the names of the columns in the table where the data will be inserted. The column 1150 names should be separated by commas within the string. For example, if you have columns named 1151 "id", "name 1152 :type columns: str 1153 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1154 the number of lines to skip at the beginning of the file before reading the actual data. This 1155 parameter allows you to skip any header information present in the file before processing the 1156 data, defaults to 0 1157 :type header_len: int (optional) 1158 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1159 separator character that is used in the file being read. In this case, the default separator is 1160 set to `\t`, which represents a tab character. You can change this parameter to a different 1161 separator character if, defaults to \t 1162 :type sep: str (optional) 1163 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1164 when processing the file in chunks. In the provided code snippet, the default value for 1165 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1166 to 1000000 1167 :type chunksize: int (optional) 1168 """ 1169 1170 # Config 1171 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1172 connexion_format = self.get_connexion_format() 1173 1174 log.debug("chunksize: " + str(chunksize)) 1175 1176 if chunksize: 1177 for chunk in pd.read_csv( 1178 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1179 ): 1180 if connexion_format in ["duckdb"]: 1181 sql_insert_into = ( 1182 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1183 ) 1184 self.conn.execute(sql_insert_into) 1185 elif connexion_format in ["sqlite"]: 1186 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1187 1188 def load_data( 1189 self, 1190 input_file: str = None, 1191 drop_variants_table: bool = False, 1192 sample_size: int = 20480, 1193 ) -> None: 1194 """ 1195 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1196 table before loading the data and specify a sample size. 1197 1198 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1199 table 1200 :type input_file: str 1201 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1202 determines whether the variants table should be dropped before loading the data. If set to 1203 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1204 not be dropped, defaults to False 1205 :type drop_variants_table: bool (optional) 1206 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1207 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1208 20480 1209 :type sample_size: int (optional) 1210 """ 1211 1212 log.info("Loading...") 1213 1214 # change input file 1215 if input_file: 1216 self.set_input(input_file) 1217 self.set_header() 1218 1219 # drop variants table 1220 if drop_variants_table: 1221 self.drop_variants_table() 1222 1223 # get table variants 1224 table_variants = self.get_table_variants() 1225 1226 # Access 1227 access = self.get_config().get("access", None) 1228 log.debug(f"access: {access}") 1229 1230 # Input format and compress 1231 input_format = self.get_input_format() 1232 input_compressed = self.get_input_compressed() 1233 log.debug(f"input_format: {input_format}") 1234 log.debug(f"input_compressed: {input_compressed}") 1235 1236 # input_compressed_format 1237 if input_compressed: 1238 input_compressed_format = "gzip" 1239 else: 1240 input_compressed_format = "none" 1241 log.debug(f"input_compressed_format: {input_compressed_format}") 1242 1243 # Connexion format 1244 connexion_format = self.get_connexion_format() 1245 1246 # Sample size 1247 if not sample_size: 1248 sample_size = -1 1249 log.debug(f"sample_size: {sample_size}") 1250 1251 # Load data 1252 log.debug(f"Load Data from {input_format}") 1253 1254 # DuckDB connexion 1255 if connexion_format in ["duckdb"]: 1256 1257 # Database already exists 1258 if self.input_format in ["db", "duckdb"]: 1259 1260 if connexion_format in ["duckdb"]: 1261 log.debug(f"Input file format '{self.input_format}' duckDB") 1262 else: 1263 log.error( 1264 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1265 ) 1266 raise ValueError( 1267 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1268 ) 1269 1270 # Load from existing database format 1271 else: 1272 1273 try: 1274 # Create Table or View 1275 database = Database(database=self.input) 1276 sql_from = 
database.get_sql_from(sample_size=sample_size) 1277 1278 if access in ["RO"]: 1279 sql_load = ( 1280 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1281 ) 1282 else: 1283 sql_load = ( 1284 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1285 ) 1286 self.conn.execute(sql_load) 1287 1288 except: 1289 # Format not available 1290 log.error(f"Input file format '{self.input_format}' not available") 1291 raise ValueError( 1292 f"Input file format '{self.input_format}' not available" 1293 ) 1294 1295 # SQLite connexion 1296 elif connexion_format in ["sqlite"] and input_format in [ 1297 "vcf", 1298 "tsv", 1299 "csv", 1300 "psv", 1301 ]: 1302 1303 # Main structure 1304 structure = { 1305 "#CHROM": "VARCHAR", 1306 "POS": "INTEGER", 1307 "ID": "VARCHAR", 1308 "REF": "VARCHAR", 1309 "ALT": "VARCHAR", 1310 "QUAL": "VARCHAR", 1311 "FILTER": "VARCHAR", 1312 "INFO": "VARCHAR", 1313 } 1314 1315 # Strcuture with samples 1316 structure_complete = structure 1317 if self.get_header_sample_list(): 1318 structure["FORMAT"] = "VARCHAR" 1319 for sample in self.get_header_sample_list(): 1320 structure_complete[sample] = "VARCHAR" 1321 1322 # Columns list for create and insert 1323 sql_create_table_columns = [] 1324 sql_create_table_columns_list = [] 1325 for column in structure_complete: 1326 column_type = structure_complete[column] 1327 sql_create_table_columns.append( 1328 f'"{column}" {column_type} default NULL' 1329 ) 1330 sql_create_table_columns_list.append(f'"{column}"') 1331 1332 # Create database 1333 log.debug(f"Create Table {table_variants}") 1334 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1335 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1336 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1337 self.conn.execute(sql_create_table) 1338 1339 # chunksize define length of file chunk load file 1340 chunksize = 100000 1341 1342 # delimiter 1343 delimiter 
= file_format_delimiters.get(input_format, "\t") 1344 1345 # Load the input file 1346 with open(self.input, "rt") as input_file: 1347 1348 # Use the appropriate file handler based on the input format 1349 if input_compressed: 1350 input_file = bgzf.open(self.input, "rt") 1351 if input_format in ["vcf"]: 1352 header_len = self.get_header_length() 1353 else: 1354 header_len = 0 1355 1356 # Insert the file contents into a table 1357 self.insert_file_to_table( 1358 input_file, 1359 columns=sql_create_table_columns_list_sql, 1360 header_len=header_len, 1361 sep=delimiter, 1362 chunksize=chunksize, 1363 ) 1364 1365 else: 1366 log.error( 1367 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1368 ) 1369 raise ValueError( 1370 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1371 ) 1372 1373 # Explode INFOS fields into table fields 1374 if self.get_explode_infos(): 1375 self.explode_infos( 1376 prefix=self.get_explode_infos_prefix(), 1377 fields=self.get_explode_infos_fields(), 1378 force=True, 1379 ) 1380 1381 # Create index after insertion 1382 self.create_indexes() 1383 1384 def get_explode_infos(self) -> bool: 1385 """ 1386 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1387 to False if it is not set. 1388 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1389 value. If the parameter is not present, it will return False. 1390 """ 1391 1392 return self.get_param().get("explode", {}).get("explode_infos", False) 1393 1394 def get_explode_infos_fields( 1395 self, 1396 explode_infos_fields: str = None, 1397 remove_fields_not_in_header: bool = False, 1398 ) -> list: 1399 """ 1400 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1401 the input parameter `explode_infos_fields`. 
        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies
        the fields to be exploded. It can be the `*` keyword to explode all header fields, or a
        comma-separated list of field names or regex patterns to explode
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: a list of INFO field names. Fields are stripped, patterns (and the `*` keyword) are
        expanded against the header INFO fields, duplicates are removed, and a field absent from the
        header is kept only when `remove_fields_not_in_header` is False.
        """

        # If no fields, get it in param ("explode" section)
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If still no fields, default to all fields in header using the * keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below —
            # confirm whether it can be removed
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (deduplicated and sorted for stable output order)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact header match takes priority over pattern expansion;
                # otherwise, drop matches that are themselves in the input list
                # (they will be handled by their own iteration)
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header),
                # keep it as-is unless removal was requested
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the given `explode_infos_prefix`, or the value
        of the "explode_infos_prefix" param ("explode" section, "" by default) when it is not provided.

        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
        prefix to be used for exploding or expanding information
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column.
If a default value is provided, it will be assigned to 1533 the column for any existing rows that do not have a value for that column 1534 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1535 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1536 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1537 to False 1538 :type drop: bool (optional) 1539 :return: a boolean value indicating whether the column was successfully added to the table. 1540 """ 1541 1542 # added 1543 added = False 1544 dropped = False 1545 1546 # Check if the column already exists in the table 1547 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1548 columns = self.get_query_to_df(query).columns.tolist() 1549 if column_name.upper() in [c.upper() for c in columns]: 1550 log.debug( 1551 f"The {column_name} column already exists in the {table_name} table" 1552 ) 1553 if drop: 1554 self.drop_column(table_name=table_name, column_name=column_name) 1555 dropped = True 1556 else: 1557 return None 1558 else: 1559 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1560 1561 # Add column in table 1562 add_column_query = ( 1563 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1564 ) 1565 if default_value is not None: 1566 add_column_query += f" DEFAULT {default_value}" 1567 self.execute_query(add_column_query) 1568 added = not dropped 1569 log.debug( 1570 f"The {column_name} column was successfully added to the {table_name} table" 1571 ) 1572 1573 if added: 1574 added_column = { 1575 "table_name": table_name, 1576 "column_name": column_name, 1577 "column_type": column_type, 1578 "default_value": default_value, 1579 } 1580 else: 1581 added_column = None 1582 1583 return added_column 1584 1585 def drop_column( 1586 self, column: dict = None, table_name: str = None, column_name: str = None 1587 ) -> bool: 1588 """ 1589 The 
`drop_column` function drops a specified column from a given table in a database and returns 1590 True if the column was successfully dropped, and False if the column does not exist in the 1591 table. 1592 1593 :param column: The `column` parameter is a dictionary that contains information about the column 1594 you want to drop. It has two keys: 1595 :type column: dict 1596 :param table_name: The `table_name` parameter is the name of the table from which you want to 1597 drop a column 1598 :type table_name: str 1599 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1600 from the table 1601 :type column_name: str 1602 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1603 and False if the column does not exist in the table. 1604 """ 1605 1606 # Find column infos 1607 if column: 1608 if isinstance(column, dict): 1609 table_name = column.get("table_name", None) 1610 column_name = column.get("column_name", None) 1611 elif isinstance(column, str): 1612 table_name = self.get_table_variants() 1613 column_name = column 1614 else: 1615 table_name = None 1616 column_name = None 1617 1618 if not table_name and not column_name: 1619 return False 1620 1621 # Removed 1622 removed = False 1623 1624 # Check if the column already exists in the table 1625 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1626 columns = self.get_query_to_df(query).columns.tolist() 1627 if column_name in columns: 1628 log.debug(f"The {column_name} column exists in the {table_name} table") 1629 else: 1630 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1631 return False 1632 1633 # Add column in table # ALTER TABLE integers DROP k 1634 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1635 self.execute_query(add_column_query) 1636 removed = True 1637 log.debug( 1638 f"The {column_name} column was successfully dropped to the {table_name} table" 1639 ) 1640 
        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded
        INFO fields. If the `prefix` is not provided or is not a string, the function will use
        `self.get_explode_infos_prefix()` (or "INFO/") as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields after the update, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter is a list of INFO fields (or patterns) that you want to
        explode into individual columns; patterns are expanded through `get_explode_infos_fields`
        :type fields: list
        :param force: The `force` parameter is a boolean flag that determines whether to drop and
        recreate a column if it already exists in the table, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether all the INFO fields are updated in a single UPDATE statement or
        one UPDATE per field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter is used to specify the name of the table where the exploded
        INFO fields will be added as individual columns; the variants table is used by default
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes (they would be invalidated by the ALTER/UPDATE below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access (read-only connections cannot be altered: no-op, returns [])
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix (fall back to the configured prefix, then to "INFO/")
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently falls back to [] on ANY error,
            # including KeyboardInterrupt — consider narrowing to Exception
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (expands * and regexes against header)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from the header; unknown fields are treated
                    # as String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (drop=force recreates an existing column)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # add_column returns None for a force-recreated column, hence
                    # the "or force" to still (re)fill it
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "info=value" from INFO,
                        # mapping '' and '.' to NULL
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no REGEXP_EXTRACT: emulate with
                            # instr/substr arithmetic
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (process the UPDATE per chromosome to bound its size)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    # NOTE(review): bare except — any query failure degrades to a
                    # single un-filtered UPDATE pass
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only filter when several chromosomes exist)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion: a composite
        (#CHROM, POS, REF, ALT) index, one index per coordinate column, and one
        per additional exploded field. No-op in read-only mode or when indexing
        is disabled.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        # NOTE(review): "FROM" (uppercase) does not match the "from" clause
        # branch of get_table_variants, so the bare table name is returned —
        # confirm intended
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes on the variants table (listed through the engine's
        catalog). No-op in read-only mode.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # List existing indexes through the engine-specific catalog
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines

        :param f: the file object
        :return: The header lines of the VCF file, up to and including the #CHROM line.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            # The #CHROM line is the last header line
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed
        (bgzf) and uncompressed files.

        :param file: The `file` parameter is a string that represents the path to the VCF header file
        that you want to read
        :type file: str
        :return: The function `read_vcf_header_file` returns a list of header lines.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
1953 """ 1954 if query: 1955 return self.conn.execute(query) # .fetchall() 1956 else: 1957 return None 1958 1959 def export_output( 1960 self, 1961 output_file: str | None = None, 1962 output_header: str | None = None, 1963 export_header: bool = True, 1964 query: str | None = None, 1965 parquet_partitions: list | None = None, 1966 chunk_size: int | None = None, 1967 threads: int | None = None, 1968 sort: bool = False, 1969 index: bool = False, 1970 order_by: str | None = None, 1971 ) -> bool: 1972 """ 1973 The `export_output` function exports data from a VCF file to a specified output file in various 1974 formats, including VCF, CSV, TSV, PSV, and Parquet. 1975 1976 :param output_file: The `output_file` parameter is a string that specifies the name of the 1977 output file to be generated by the function. This is where the exported data will be saved 1978 :type output_file: str 1979 :param output_header: The `output_header` parameter is a string that specifies the name of the 1980 file where the header of the VCF file will be exported. If this parameter is not provided, the 1981 header will be exported to a file with the same name as the `output_file` parameter, but with 1982 the extension " 1983 :type output_header: str 1984 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1985 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1986 True, the header will be exported to a file. If `export_header` is False, the header will not 1987 be, defaults to True, if output format is not VCF 1988 :type export_header: bool (optional) 1989 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1990 select specific data from the VCF file before exporting it. 
If provided, only the data that
        matches the query will be exported
        :type query: str
        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
        organize data in a hierarchical directory structure based on the values of one or more columns.
        This can improve query performance when working with large datasets
        :type parquet_partitions: list
        :param chunk_size: The `chunk_size` parameter specifies the number of
        records in batch when exporting data in Parquet format. This parameter is used for
        partitioning the Parquet file into multiple files.
        :type chunk_size: int
        :param threads: The `threads` parameter is an optional parameter that specifies the number of
        threads to be used during the export process. It determines the level of parallelism and can
        improve the performance of the export operation. If not provided, the function will use the
        default number of threads
        :type threads: int
        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
        False
        :type sort: bool (optional)
        :param index: The `index` parameter is a boolean flag that determines whether an index should be
        created on the output file. If `index` is True, an index will be created. If `index` is False,
        no index will be created. The default value is False, defaults to False
        :type index: bool (optional)
        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
        sorting the output file. This parameter is only applicable when exporting data in VCF format
        :type order_by: str
        :return: a boolean value.
It checks if the output file exists and returns True if it does, or
        None if it doesn't.
        """

        # Log
        log.info("Exporting...")

        # Expand user/relative paths before any file-system work
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Temporary files to remove at the end of the export
        tmp_to_remove = []

        # If no output, fall back to the object's configured output
        if not output_file:
            output_file = self.get_output()

        # If not threads, use the configured thread count
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output: a VCF embeds its own header,
        # so the side-car ".hdr" file is only temporary
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (param fallback; a comma-separated string is split)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database source defaults to the live connexion
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos columns before export if requested
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
2092 if connexion_format in ["sqlite"]: 2093 2094 # Export in Parquet 2095 random_tmp = "".join( 2096 random.choice(string.ascii_lowercase) for i in range(10) 2097 ) 2098 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2099 tmp_to_remove.append(database_source) 2100 2101 # Table Variants 2102 table_variants = self.get_table_variants() 2103 2104 # Create export query 2105 sql_query_export_subquery = f""" 2106 SELECT * FROM {table_variants} 2107 """ 2108 2109 # Write source file 2110 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2111 2112 # Create database 2113 database = Database( 2114 database=database_source, 2115 table="variants", 2116 header_file=output_header, 2117 conn_config=self.get_connexion_config(), 2118 ) 2119 2120 # Existing colomns header 2121 # existing_columns_header = database.get_header_file_columns(output_header) 2122 existing_columns_header = database.get_header_columns_from_database() 2123 2124 # Export file 2125 database.export( 2126 output_database=output_file, 2127 output_header=output_header, 2128 existing_columns_header=existing_columns_header, 2129 parquet_partitions=parquet_partitions, 2130 chunk_size=chunk_size, 2131 threads=threads, 2132 sort=sort, 2133 index=index, 2134 header_in_output=header_in_output, 2135 order_by=order_by, 2136 query=query, 2137 export_header=export_header, 2138 ) 2139 2140 # Remove 2141 remove_if_exists(tmp_to_remove) 2142 2143 return (os.path.exists(output_file) or None) and ( 2144 os.path.exists(output_file) or None 2145 ) 2146 2147 def get_extra_infos(self, table: str = None) -> list: 2148 """ 2149 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2150 in the header. 2151 2152 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2153 name of the table from which you want to retrieve the extra columns that are not present in the 2154 header. 
If the `table` parameter is not provided when calling the function, it will default to
        using the variants
        :type table: str
        :return: A list of columns that are in the specified table but not in the header of the table.
        """

        header_columns = []

        # NOTE(review): when an explicit `table` is given, header_columns
        # stays empty, so *every* column of that table is reported as
        # "extra" -- confirm this is the intended behaviour
        if not table:
            table = self.get_table_variants(clause="from")
            header_columns = self.get_header_columns()

        # Check all columns in the database (LIMIT 1 just to get the schema)
        query = f""" SELECT * FROM {table} LIMIT 1 """
        log.debug(f"query {query}")
        table_columns = self.get_query_to_df(query).columns.tolist()
        extra_columns = []

        # Construct extra infos (not in header)
        for column in table_columns:
            if column not in header_columns:
                extra_columns.append(column)

        return extra_columns

    def get_extra_infos_sql(self, table: str = None) -> str:
        """
        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
        by double quotes

        :param table: The name of the table to get the extra infos from. If None, the default table is
        used
        :type table: str
        :return: A string of the extra infos, double-quoted and comma-separated,
        ready to be embedded in a SQL column list.
        """

        return ", ".join(
            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
        )

    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        The `export_header` function takes a VCF file, extracts the header, modifies it according to
        specified options, and writes it to a new file.

        :param header_name: The `header_name` parameter is the name of the header file to be created.
        If this parameter is not specified, the header will be written to the output file
        :type header_name: str
        :param output_file: The `output_file` parameter in the `export_header` function is used to
        specify the name of the output file where the header will be written. If this parameter is not
        provided, the header will be written to a temporary file
        :type output_file: str
        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
        string that represents the extension of the output header file. By default, it is set to ".hdr"
        if not specified by the user. This extension will be appended to the `output_file` name to
        create the final, defaults to .hdr
        :type output_file_ext: str (optional)
        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
        `True`, the function will clean the header by modifying certain lines based on a specific
        pattern. If `clean_header`, defaults to True
        :type clean_header: bool (optional)
        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
        boolean flag that determines whether the #CHROM line should be removed from the header before
        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
        defaults to False
        :type remove_chrom_line: bool (optional)
        :return: The function `export_header` returns the name of the temporary header file that is
        created.
        """

        if not header_name and not output_file:
            output_file = self.get_output()

        # NOTE(review): `header_name` is accepted but never used below; the
        # header is always written to `output_file + output_file_ext`.  If
        # `header_name` is set while `output_file` is None, the concatenation
        # below raises TypeError -- confirm the intended contract
        if self.get_header():

            # Get header object
            header_obj = self.get_header()

            # Create database
            db_for_header = Database(database=self.get_input())

            # Get real columns in the file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Write header file
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Replace #CHROM line with real columns found in the file
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Remove CHROM line
                if remove_chrom_line:
                    header_list.pop()

                # Clean header
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        # Clean head for malformed header:
                        # FORMAT fields cannot be of Type=Flag, rewrite as String
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        # Write header
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

            tmp_header_name = output_file + output_file_ext

            f = open(tmp_header_name, "w")
            for line in header_list:
                f.write(line)
            f.close()

            return tmp_header_name

    def export_variant_vcf(
        self,
        vcf_file,
        remove_info: bool = False,
        add_samples: bool = True,
        list_samples: list = [],
        where_clause: str = "",
        index: bool = False,
        threads: int | None = None,
    ) -> bool | None:
        """
        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
        remove INFO field, add samples, and control compression and indexing.
:param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
        written to. It is the output file that will contain the filtered VCF data based on the specified
        parameters
        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
        in, defaults to False
        :type remove_info: bool (optional)
        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
        the samples should be added to the VCF file or not. If set to True, the samples will be added.
        If set to False, the samples will be removed. The default value is True, defaults to True
        :type add_samples: bool (optional)
        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
        in the output VCF file. By default, all samples will be included. If you provide a list of
        samples, only those samples will be included in the output file
        :type list_samples: list
        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
        determines whether or not to create an index for the output VCF file. If `index` is set to
        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
        :type index: bool (optional)
        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
        number of threads to use for exporting the VCF file. It determines how many parallel threads
        will be used during the export process. More threads can potentially speed up the export process
        by utilizing multiple cores of the processor. If
        :type threads: int | None
        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
        method with various parameters including the output file, query, threads, sort flag, and index
        flag. The `export_output` method is responsible for exporting the VCF data based on the
        specified parameters and configurations provided in the `export_variant_vcf` function.
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # Info fields
        # `remove_info` may also be a string: it is then used verbatim as the
        # literal INFO value; boolean True is mapped to "."
        if remove_info:
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # Samples fields (FORMAT column plus one column per selected sample)
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Where clause
        if where_clause is None:
            where_clause = ""

        # Variants
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        # Delegate to the generic exporter; sort=True to keep VCF coordinate order
        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    # NOTE(review): mutable default `commands=[]` is shared across calls;
    # harmless here only as long as it is never mutated -- confirm
    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        It takes a list of commands and runs
them in parallel using the number of threads specified 2392 2393 :param commands: A list of commands to run 2394 :param threads: The number of threads to use, defaults to 1 (optional) 2395 """ 2396 2397 run_parallel_commands(commands, threads) 2398 2399 def get_threads(self, default: int = 1) -> int: 2400 """ 2401 This function returns the number of threads to use for a job, with a default value of 1 if not 2402 specified. 2403 2404 :param default: The `default` parameter in the `get_threads` method is used to specify the 2405 default number of threads to use if no specific value is provided. If no value is provided for 2406 the `threads` parameter in the configuration or input parameters, the `default` value will be 2407 used, defaults to 1 2408 :type default: int (optional) 2409 :return: the number of threads to use for the current job. 2410 """ 2411 2412 # Config 2413 config = self.get_config() 2414 2415 # Param 2416 param = self.get_param() 2417 2418 # Input threads 2419 input_thread = param.get("threads", config.get("threads", None)) 2420 2421 # Check threads 2422 if not input_thread: 2423 threads = default 2424 elif int(input_thread) <= 0: 2425 threads = os.cpu_count() 2426 else: 2427 threads = int(input_thread) 2428 return threads 2429 2430 def get_memory(self, default: str = None) -> str: 2431 """ 2432 This function retrieves the memory value from parameters or configuration with a default value 2433 if not found. 2434 2435 :param default: The `get_memory` function takes in a default value as a string parameter. This 2436 default value is used as a fallback in case the `memory` parameter is not provided in the 2437 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2438 the function 2439 :type default: str 2440 :return: The `get_memory` function returns a string value representing the memory parameter. If 2441 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2442 return the default value provided as an argument to the function. 2443 """ 2444 2445 # Config 2446 config = self.get_config() 2447 2448 # Param 2449 param = self.get_param() 2450 2451 # Input threads 2452 input_memory = param.get("memory", config.get("memory", None)) 2453 2454 # Check threads 2455 if input_memory: 2456 memory = input_memory 2457 else: 2458 memory = default 2459 2460 return memory 2461 2462 def update_from_vcf(self, vcf_file: str) -> None: 2463 """ 2464 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2465 2466 :param vcf_file: the path to the VCF file 2467 """ 2468 2469 connexion_format = self.get_connexion_format() 2470 2471 if connexion_format in ["duckdb"]: 2472 self.update_from_vcf_duckdb(vcf_file) 2473 elif connexion_format in ["sqlite"]: 2474 self.update_from_vcf_sqlite(vcf_file) 2475 2476 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2477 """ 2478 It takes a VCF file and updates the INFO column of the variants table in the database with the 2479 INFO column of the VCF file 2480 2481 :param vcf_file: the path to the VCF file 2482 """ 2483 2484 # varaints table 2485 table_variants = self.get_table_variants() 2486 2487 # Loading VCF into temporaire table 2488 skip = self.get_header_length(file=vcf_file) 2489 vcf_df = pd.read_csv( 2490 vcf_file, 2491 sep="\t", 2492 engine="c", 2493 skiprows=skip, 2494 header=0, 2495 low_memory=False, 2496 ) 2497 sql_query_update = f""" 2498 UPDATE {table_variants} as table_variants 2499 SET INFO = concat( 2500 CASE 2501 WHEN INFO NOT IN ('', '.') 2502 THEN INFO 2503 ELSE '' 2504 END, 2505 ( 2506 SELECT 2507 concat( 2508 CASE 2509 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2510 THEN ';' 2511 ELSE '' 2512 END 2513 , 2514 CASE 2515 WHEN table_parquet.INFO NOT IN ('','.') 2516 THEN table_parquet.INFO 2517 ELSE '' 2518 END 2519 ) 2520 FROM vcf_df as table_parquet 2521 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2522 AND table_parquet.\"POS\" = table_variants.\"POS\" 2523 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2524 AND table_parquet.\"REF\" = table_variants.\"REF\" 2525 AND table_parquet.INFO NOT IN ('','.') 2526 ) 2527 ) 2528 ; 2529 """ 2530 self.conn.execute(sql_query_update) 2531 2532 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2533 """ 2534 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2535 table, then updates the INFO column of the variants table with the INFO column of the temporary 2536 table 2537 2538 :param vcf_file: The path to the VCF file you want to update the database with 2539 """ 2540 2541 # Create a temporary table for the VCF 2542 table_vcf = "tmp_vcf" 2543 sql_create = ( 2544 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2545 ) 2546 self.conn.execute(sql_create) 2547 2548 # Loading VCF into temporaire table 2549 vcf_df = pd.read_csv( 2550 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2551 ) 2552 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2553 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2554 2555 # Update table 'variants' with VCF data 2556 # warning: CONCAT as || operator 2557 sql_query_update = f""" 2558 UPDATE variants as table_variants 2559 SET INFO = CASE 2560 WHEN INFO NOT IN ('', '.') 2561 THEN INFO 2562 ELSE '' 2563 END || 2564 ( 2565 SELECT 2566 CASE 2567 WHEN table_variants.INFO NOT IN ('','.') 2568 AND table_vcf.INFO NOT IN ('','.') 2569 THEN ';' 2570 ELSE '' 2571 END || 2572 CASE 2573 WHEN table_vcf.INFO NOT IN ('','.') 2574 THEN table_vcf.INFO 2575 ELSE '' 2576 END 2577 FROM {table_vcf} as table_vcf 2578 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2579 AND table_vcf.\"POS\" = table_variants.\"POS\" 2580 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2581 AND table_vcf.\"REF\" = table_variants.\"REF\" 2582 
) 2583 """ 2584 self.conn.execute(sql_query_update) 2585 2586 # Drop temporary table 2587 sql_drop = f"DROP TABLE {table_vcf}" 2588 self.conn.execute(sql_drop) 2589 2590 def drop_variants_table(self) -> None: 2591 """ 2592 > This function drops the variants table 2593 """ 2594 2595 table_variants = self.get_table_variants() 2596 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2597 self.conn.execute(sql_table_variants) 2598 2599 def set_variant_id( 2600 self, variant_id_column: str = "variant_id", force: bool = None 2601 ) -> str: 2602 """ 2603 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2604 `#CHROM`, `POS`, `REF`, and `ALT` columns 2605 2606 :param variant_id_column: The name of the column to be created in the variants table, defaults 2607 to variant_id 2608 :type variant_id_column: str (optional) 2609 :param force: If True, the variant_id column will be created even if it already exists 2610 :type force: bool 2611 :return: The name of the column that contains the variant_id 2612 """ 2613 2614 # Assembly 2615 assembly = self.get_param().get( 2616 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2617 ) 2618 2619 # INFO/Tag prefix 2620 prefix = self.get_explode_infos_prefix() 2621 2622 # Explode INFO/SVTYPE 2623 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2624 2625 # variants table 2626 table_variants = self.get_table_variants() 2627 2628 # variant_id column 2629 if not variant_id_column: 2630 variant_id_column = "variant_id" 2631 2632 # Creta variant_id column 2633 if "variant_id" not in self.get_extra_infos() or force: 2634 2635 # Create column 2636 self.add_column( 2637 table_name=table_variants, 2638 column_name=variant_id_column, 2639 column_type="UBIGINT", 2640 default_value="0", 2641 ) 2642 2643 # Update column 2644 self.conn.execute( 2645 f""" 2646 UPDATE {table_variants} 2647 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2648 """ 2649 ) 2650 2651 # Remove added columns 2652 for added_column in added_columns: 2653 self.drop_column(column=added_column) 2654 2655 # return variant_id column name 2656 return variant_id_column 2657 2658 def get_variant_id_column( 2659 self, variant_id_column: str = "variant_id", force: bool = None 2660 ) -> str: 2661 """ 2662 This function returns the variant_id column name 2663 2664 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2665 defaults to variant_id 2666 :type variant_id_column: str (optional) 2667 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2668 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2669 if it is not already set, or if it is set 2670 :type force: bool 2671 :return: The variant_id column name. 2672 """ 2673 2674 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2675 2676 ### 2677 # Annotation 2678 ### 2679 2680 def scan_databases( 2681 self, 2682 database_formats: list = ["parquet"], 2683 database_releases: list = ["current"], 2684 ) -> dict: 2685 """ 2686 The function `scan_databases` scans for available databases based on specified formats and 2687 releases. 2688 2689 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2690 of the databases to be scanned. In this case, the accepted format is "parquet" 2691 :type database_formats: list ["parquet"] 2692 :param database_releases: The `database_releases` parameter is a list that specifies the 2693 releases of the databases to be scanned. 
In the provided function, the default value for 2694 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2695 databases that are in the "current" 2696 :type database_releases: list 2697 :return: The function `scan_databases` returns a dictionary containing information about 2698 databases that match the specified formats and releases. 2699 """ 2700 2701 # Config 2702 config = self.get_config() 2703 2704 # Param 2705 param = self.get_param() 2706 2707 # Param - Assembly 2708 assembly = param.get("assembly", config.get("assembly", None)) 2709 if not assembly: 2710 assembly = DEFAULT_ASSEMBLY 2711 log.warning(f"Default assembly '{assembly}'") 2712 2713 # Scan for availabled databases 2714 log.info( 2715 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2716 ) 2717 databases_infos_dict = databases_infos( 2718 database_folder_releases=database_releases, 2719 database_formats=database_formats, 2720 assembly=assembly, 2721 config=config, 2722 ) 2723 log.info( 2724 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2725 ) 2726 2727 return databases_infos_dict 2728 2729 def annotation(self) -> None: 2730 """ 2731 It annotates the VCF file with the annotations specified in the config file. 
2732 """ 2733 2734 # Config 2735 config = self.get_config() 2736 2737 # Param 2738 param = self.get_param() 2739 2740 # Param - Assembly 2741 assembly = param.get("assembly", config.get("assembly", None)) 2742 if not assembly: 2743 assembly = DEFAULT_ASSEMBLY 2744 log.warning(f"Default assembly '{assembly}'") 2745 2746 # annotations databases folders 2747 annotations_databases = set( 2748 config.get("folders", {}) 2749 .get("databases", {}) 2750 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2751 + config.get("folders", {}) 2752 .get("databases", {}) 2753 .get("parquet", ["~/howard/databases/parquet/current"]) 2754 + config.get("folders", {}) 2755 .get("databases", {}) 2756 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2757 ) 2758 2759 # Get param annotations 2760 if param.get("annotations", None) and isinstance( 2761 param.get("annotations", None), str 2762 ): 2763 log.debug(param.get("annotations", None)) 2764 param_annotation_list = param.get("annotations").split(",") 2765 else: 2766 param_annotation_list = [] 2767 2768 # Each tools param 2769 if param.get("annotation_parquet", None) != None: 2770 log.debug( 2771 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2772 ) 2773 if isinstance(param.get("annotation_parquet", None), list): 2774 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2775 else: 2776 param_annotation_list.append(param.get("annotation_parquet")) 2777 if param.get("annotation_snpsift", None) != None: 2778 if isinstance(param.get("annotation_snpsift", None), list): 2779 param_annotation_list.append( 2780 "snpsift:" 2781 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2782 ) 2783 else: 2784 param_annotation_list.append( 2785 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2786 ) 2787 if param.get("annotation_snpeff", None) != None: 2788 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2789 if param.get("annotation_bcftools", 
None) != None: 2790 if isinstance(param.get("annotation_bcftools", None), list): 2791 param_annotation_list.append( 2792 "bcftools:" 2793 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2794 ) 2795 else: 2796 param_annotation_list.append( 2797 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2798 ) 2799 if param.get("annotation_annovar", None) != None: 2800 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2801 if param.get("annotation_exomiser", None) != None: 2802 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2803 if param.get("annotation_splice", None) != None: 2804 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2805 2806 # Merge param annotations list 2807 param["annotations"] = ",".join(param_annotation_list) 2808 2809 # debug 2810 log.debug(f"param_annotations={param['annotations']}") 2811 2812 if param.get("annotations"): 2813 2814 # Log 2815 # log.info("Annotations - Check annotation parameters") 2816 2817 if not "annotation" in param: 2818 param["annotation"] = {} 2819 2820 # List of annotations parameters 2821 annotations_list_input = {} 2822 if isinstance(param.get("annotations", None), str): 2823 annotation_file_list = [ 2824 value for value in param.get("annotations", "").split(",") 2825 ] 2826 for annotation_file in annotation_file_list: 2827 annotations_list_input[annotation_file] = {"INFO": None} 2828 else: 2829 annotations_list_input = param.get("annotations", {}) 2830 2831 log.info(f"Quick Annotations:") 2832 for annotation_key in list(annotations_list_input.keys()): 2833 log.info(f" {annotation_key}") 2834 2835 # List of annotations and associated fields 2836 annotations_list = {} 2837 2838 for annotation_file in annotations_list_input: 2839 2840 # Explode annotations if ALL 2841 if ( 2842 annotation_file.upper() == "ALL" 2843 or annotation_file.upper().startswith("ALL:") 2844 ): 2845 2846 # check ALL parameters (formats, releases) 
2847 annotation_file_split = annotation_file.split(":") 2848 database_formats = "parquet" 2849 database_releases = "current" 2850 for annotation_file_option in annotation_file_split[1:]: 2851 database_all_options_split = annotation_file_option.split("=") 2852 if database_all_options_split[0] == "format": 2853 database_formats = database_all_options_split[1].split("+") 2854 if database_all_options_split[0] == "release": 2855 database_releases = database_all_options_split[1].split("+") 2856 2857 # Scan for availabled databases 2858 databases_infos_dict = self.scan_databases( 2859 database_formats=database_formats, 2860 database_releases=database_releases, 2861 ) 2862 2863 # Add found databases in annotation parameters 2864 for database_infos in databases_infos_dict.keys(): 2865 annotations_list[database_infos] = {"INFO": None} 2866 2867 else: 2868 annotations_list[annotation_file] = annotations_list_input[ 2869 annotation_file 2870 ] 2871 2872 # Check each databases 2873 if len(annotations_list): 2874 2875 log.info( 2876 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2877 ) 2878 2879 for annotation_file in annotations_list: 2880 2881 # Init 2882 annotations = annotations_list.get(annotation_file, None) 2883 2884 # Annotation snpEff 2885 if annotation_file.startswith("snpeff"): 2886 2887 log.debug(f"Quick Annotation snpEff") 2888 2889 if "snpeff" not in param["annotation"]: 2890 param["annotation"]["snpeff"] = {} 2891 2892 if "options" not in param["annotation"]["snpeff"]: 2893 param["annotation"]["snpeff"]["options"] = "" 2894 2895 # snpEff options in annotations 2896 param["annotation"]["snpeff"]["options"] = "".join( 2897 annotation_file.split(":")[1:] 2898 ) 2899 2900 # Annotation Annovar 2901 elif annotation_file.startswith("annovar"): 2902 2903 log.debug(f"Quick Annotation Annovar") 2904 2905 if "annovar" not in param["annotation"]: 2906 param["annotation"]["annovar"] = {} 2907 2908 if "annotations" not in param["annotation"]["annovar"]: 2909 param["annotation"]["annovar"]["annotations"] = {} 2910 2911 # Options 2912 annotation_file_split = annotation_file.split(":") 2913 for annotation_file_annotation in annotation_file_split[1:]: 2914 if annotation_file_annotation: 2915 param["annotation"]["annovar"]["annotations"][ 2916 annotation_file_annotation 2917 ] = annotations 2918 2919 # Annotation Exomiser 2920 elif annotation_file.startswith("exomiser"): 2921 2922 log.debug(f"Quick Annotation Exomiser") 2923 2924 param["annotation"]["exomiser"] = params_string_to_dict( 2925 annotation_file 2926 ) 2927 2928 # Annotation Splice 2929 elif annotation_file.startswith("splice"): 2930 2931 log.debug(f"Quick Annotation Splice") 2932 2933 param["annotation"]["splice"] = params_string_to_dict( 2934 annotation_file 2935 ) 2936 2937 # Annotation Parquet or BCFTOOLS 2938 else: 2939 2940 # Tools detection 2941 if annotation_file.startswith("bcftools:"): 2942 annotation_tool_initial = "bcftools" 2943 annotation_file = ":".join(annotation_file.split(":")[1:]) 2944 elif annotation_file.startswith("snpsift:"): 2945 annotation_tool_initial = 
"snpsift" 2946 annotation_file = ":".join(annotation_file.split(":")[1:]) 2947 else: 2948 annotation_tool_initial = None 2949 2950 # list of files 2951 annotation_file_list = annotation_file.replace("+", ":").split( 2952 ":" 2953 ) 2954 2955 for annotation_file in annotation_file_list: 2956 2957 if annotation_file: 2958 2959 # Annotation tool initial 2960 annotation_tool = annotation_tool_initial 2961 2962 # Find file 2963 annotation_file_found = None 2964 2965 # Expand user 2966 annotation_file = full_path(annotation_file) 2967 2968 if os.path.exists(annotation_file): 2969 annotation_file_found = annotation_file 2970 2971 else: 2972 # Find within assembly folders 2973 for annotations_database in annotations_databases: 2974 found_files = find_all( 2975 annotation_file, 2976 os.path.join( 2977 annotations_database, assembly 2978 ), 2979 ) 2980 if len(found_files) > 0: 2981 annotation_file_found = found_files[0] 2982 break 2983 if not annotation_file_found and not assembly: 2984 # Find within folders 2985 for ( 2986 annotations_database 2987 ) in annotations_databases: 2988 found_files = find_all( 2989 annotation_file, annotations_database 2990 ) 2991 if len(found_files) > 0: 2992 annotation_file_found = found_files[0] 2993 break 2994 log.debug( 2995 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2996 ) 2997 2998 # Full path 2999 annotation_file_found = full_path(annotation_file_found) 3000 3001 if annotation_file_found: 3002 3003 database = Database(database=annotation_file_found) 3004 quick_annotation_format = database.get_format() 3005 quick_annotation_is_compressed = ( 3006 database.is_compressed() 3007 ) 3008 quick_annotation_is_indexed = os.path.exists( 3009 f"{annotation_file_found}.tbi" 3010 ) 3011 bcftools_preference = False 3012 3013 # Check Annotation Tool 3014 if not annotation_tool: 3015 if ( 3016 bcftools_preference 3017 and quick_annotation_format 3018 in ["vcf", "bed"] 3019 and quick_annotation_is_compressed 3020 and 
quick_annotation_is_indexed 3021 ): 3022 annotation_tool = "bcftools" 3023 elif quick_annotation_format in [ 3024 "vcf", 3025 "bed", 3026 "tsv", 3027 "tsv", 3028 "csv", 3029 "json", 3030 "tbl", 3031 "parquet", 3032 "duckdb", 3033 ]: 3034 annotation_tool = "parquet" 3035 else: 3036 log.error( 3037 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3038 ) 3039 raise ValueError( 3040 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3041 ) 3042 3043 log.debug( 3044 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3045 ) 3046 3047 # Annotation Tool dispatch 3048 if annotation_tool: 3049 if annotation_tool not in param["annotation"]: 3050 param["annotation"][annotation_tool] = {} 3051 if ( 3052 "annotations" 3053 not in param["annotation"][annotation_tool] 3054 ): 3055 param["annotation"][annotation_tool][ 3056 "annotations" 3057 ] = {} 3058 param["annotation"][annotation_tool][ 3059 "annotations" 3060 ][annotation_file_found] = annotations 3061 3062 else: 3063 log.error( 3064 f"Quick Annotation File {annotation_file} does NOT exist" 3065 ) 3066 3067 self.set_param(param) 3068 3069 if param.get("annotation", None): 3070 log.info("Annotations") 3071 if param.get("annotation", {}).get("parquet", None): 3072 log.info("Annotations 'parquet'...") 3073 self.annotation_parquet() 3074 if param.get("annotation", {}).get("bcftools", None): 3075 log.info("Annotations 'bcftools'...") 3076 self.annotation_bcftools() 3077 if param.get("annotation", {}).get("snpsift", None): 3078 log.info("Annotations 'snpsift'...") 3079 self.annotation_snpsift() 3080 if param.get("annotation", {}).get("annovar", None): 3081 log.info("Annotations 'annovar'...") 3082 self.annotation_annovar() 3083 if param.get("annotation", {}).get("snpeff", None): 3084 log.info("Annotations 'snpeff'...") 3085 self.annotation_snpeff() 3086 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3087 log.info("Annotations 'exomiser'...") 3088 self.annotation_exomiser() 3089 if param.get("annotation", {}).get("splice", None) is not None: 3090 log.info("Annotations 'splice' ...") 3091 self.annotation_splice() 3092 3093 # Explode INFOS fields into table fields 3094 if self.get_explode_infos(): 3095 self.explode_infos( 3096 prefix=self.get_explode_infos_prefix(), 3097 fields=self.get_explode_infos_fields(), 3098 force=True, 3099 ) 3100 3101 def annotation_snpsift(self, threads: int = None) -> None: 3102 """ 3103 This function annotate with bcftools 3104 3105 :param threads: Number of threads to use 3106 :return: the value of the variable "return_value". 3107 """ 3108 3109 # DEBUG 3110 log.debug("Start annotation with bcftools databases") 3111 3112 # Threads 3113 if not threads: 3114 threads = self.get_threads() 3115 log.debug("Threads: " + str(threads)) 3116 3117 # Config 3118 config = self.get_config() 3119 log.debug("Config: " + str(config)) 3120 3121 # Config - snpSift 3122 snpsift_bin_command = get_bin_command( 3123 bin="SnpSift.jar", 3124 tool="snpsift", 3125 bin_type="jar", 3126 config=config, 3127 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3128 ) 3129 if not snpsift_bin_command: 3130 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3131 log.error(msg_err) 3132 raise ValueError(msg_err) 3133 3134 # Config - bcftools 3135 bcftools_bin_command = get_bin_command( 3136 bin="bcftools", 3137 tool="bcftools", 3138 bin_type="bin", 3139 config=config, 3140 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3141 ) 3142 if not bcftools_bin_command: 3143 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3144 log.error(msg_err) 3145 raise ValueError(msg_err) 3146 3147 # Config - BCFTools databases folders 3148 databases_folders = set( 3149 self.get_config() 3150 .get("folders", {}) 3151 .get("databases", {}) 3152 .get("annotations", ["."]) 3153 + self.get_config() 3154 .get("folders", {}) 3155 
.get("databases", {}) 3156 .get("bcftools", ["."]) 3157 ) 3158 log.debug("Databases annotations: " + str(databases_folders)) 3159 3160 # Param 3161 annotations = ( 3162 self.get_param() 3163 .get("annotation", {}) 3164 .get("snpsift", {}) 3165 .get("annotations", None) 3166 ) 3167 log.debug("Annotations: " + str(annotations)) 3168 3169 # Assembly 3170 assembly = self.get_param().get( 3171 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3172 ) 3173 3174 # Data 3175 table_variants = self.get_table_variants() 3176 3177 # Check if not empty 3178 log.debug("Check if not empty") 3179 sql_query_chromosomes = ( 3180 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3181 ) 3182 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3183 if not sql_query_chromosomes_df["count"][0]: 3184 log.info(f"VCF empty") 3185 return 3186 3187 # VCF header 3188 vcf_reader = self.get_header() 3189 log.debug("Initial header: " + str(vcf_reader.infos)) 3190 3191 # Existing annotations 3192 for vcf_annotation in self.get_header().infos: 3193 3194 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3195 log.debug( 3196 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3197 ) 3198 3199 if annotations: 3200 3201 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3202 3203 # Export VCF file 3204 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3205 3206 # Init 3207 commands = {} 3208 3209 for annotation in annotations: 3210 annotation_fields = annotations[annotation] 3211 3212 # Annotation Name 3213 annotation_name = os.path.basename(annotation) 3214 3215 if not annotation_fields: 3216 annotation_fields = {"INFO": None} 3217 3218 log.debug(f"Annotation '{annotation_name}'") 3219 log.debug( 3220 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3221 ) 3222 3223 # Create Database 3224 database = Database( 3225 database=annotation, 3226 databases_folders=databases_folders, 3227 
assembly=assembly, 3228 ) 3229 3230 # Find files 3231 db_file = database.get_database() 3232 db_file = full_path(db_file) 3233 db_hdr_file = database.get_header_file() 3234 db_hdr_file = full_path(db_hdr_file) 3235 db_file_type = database.get_format() 3236 db_tbi_file = f"{db_file}.tbi" 3237 db_file_compressed = database.is_compressed() 3238 3239 # Check if compressed 3240 if not db_file_compressed: 3241 log.error( 3242 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3243 ) 3244 raise ValueError( 3245 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3246 ) 3247 3248 # Check if indexed 3249 if not os.path.exists(db_tbi_file): 3250 log.error( 3251 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3252 ) 3253 raise ValueError( 3254 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3255 ) 3256 3257 # Check index - try to create if not exists 3258 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3259 log.error("Annotation failed: database not valid") 3260 log.error(f"Annotation annotation file: {db_file}") 3261 log.error(f"Annotation annotation header: {db_hdr_file}") 3262 log.error(f"Annotation annotation index: {db_tbi_file}") 3263 raise ValueError( 3264 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3265 ) 3266 else: 3267 3268 log.debug( 3269 f"Annotation '{annotation}' - file: " 3270 + str(db_file) 3271 + " and " 3272 + str(db_hdr_file) 3273 ) 3274 3275 # Load header as VCF object 3276 db_hdr_vcf = Variants(input=db_hdr_file) 3277 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3278 log.debug( 3279 "Annotation database header: " 3280 + str(db_hdr_vcf_header_infos) 3281 ) 3282 3283 # For all fields in database 3284 annotation_fields_full = False 3285 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3286 annotation_fields = { 3287 key: key for key in 
db_hdr_vcf_header_infos 3288 } 3289 log.debug( 3290 "Annotation database header - All annotations added: " 3291 + str(annotation_fields) 3292 ) 3293 annotation_fields_full = True 3294 3295 # # Create file for field rename 3296 # log.debug("Create file for field rename") 3297 # tmp_rename = NamedTemporaryFile( 3298 # prefix=self.get_prefix(), 3299 # dir=self.get_tmp_dir(), 3300 # suffix=".rename", 3301 # delete=False, 3302 # ) 3303 # tmp_rename_name = tmp_rename.name 3304 # tmp_files.append(tmp_rename_name) 3305 3306 # Number of fields 3307 nb_annotation_field = 0 3308 annotation_list = [] 3309 annotation_infos_rename_list = [] 3310 3311 for annotation_field in annotation_fields: 3312 3313 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3314 annotation_fields_new_name = annotation_fields.get( 3315 annotation_field, annotation_field 3316 ) 3317 if not annotation_fields_new_name: 3318 annotation_fields_new_name = annotation_field 3319 3320 # Check if field is in DB and if field is not elready in input data 3321 if ( 3322 annotation_field in db_hdr_vcf.get_header().infos 3323 and annotation_fields_new_name 3324 not in self.get_header().infos 3325 ): 3326 3327 log.info( 3328 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3329 ) 3330 3331 # BCFTools annotate param to rename fields 3332 if annotation_field != annotation_fields_new_name: 3333 annotation_infos_rename_list.append( 3334 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3335 ) 3336 3337 # Add INFO field to header 3338 db_hdr_vcf_header_infos_number = ( 3339 db_hdr_vcf_header_infos[annotation_field].num or "." 
3340 ) 3341 db_hdr_vcf_header_infos_type = ( 3342 db_hdr_vcf_header_infos[annotation_field].type 3343 or "String" 3344 ) 3345 db_hdr_vcf_header_infos_description = ( 3346 db_hdr_vcf_header_infos[annotation_field].desc 3347 or f"{annotation_field} description" 3348 ) 3349 db_hdr_vcf_header_infos_source = ( 3350 db_hdr_vcf_header_infos[annotation_field].source 3351 or "unknown" 3352 ) 3353 db_hdr_vcf_header_infos_version = ( 3354 db_hdr_vcf_header_infos[annotation_field].version 3355 or "unknown" 3356 ) 3357 3358 vcf_reader.infos[annotation_fields_new_name] = ( 3359 vcf.parser._Info( 3360 annotation_fields_new_name, 3361 db_hdr_vcf_header_infos_number, 3362 db_hdr_vcf_header_infos_type, 3363 db_hdr_vcf_header_infos_description, 3364 db_hdr_vcf_header_infos_source, 3365 db_hdr_vcf_header_infos_version, 3366 self.code_type_map[ 3367 db_hdr_vcf_header_infos_type 3368 ], 3369 ) 3370 ) 3371 3372 annotation_list.append(annotation_field) 3373 3374 nb_annotation_field += 1 3375 3376 else: 3377 3378 if ( 3379 annotation_field 3380 not in db_hdr_vcf.get_header().infos 3381 ): 3382 log.warning( 3383 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3384 ) 3385 if ( 3386 annotation_fields_new_name 3387 in self.get_header().infos 3388 ): 3389 log.warning( 3390 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3391 ) 3392 3393 log.info( 3394 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3395 ) 3396 3397 annotation_infos = ",".join(annotation_list) 3398 3399 if annotation_infos != "": 3400 3401 # Annotated VCF (and error file) 3402 tmp_annotation_vcf_name = os.path.join( 3403 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3404 ) 3405 tmp_annotation_vcf_name_err = ( 3406 tmp_annotation_vcf_name + ".err" 3407 ) 3408 3409 # Add fields to annotate 3410 if not annotation_fields_full: 3411 annotation_infos_option = f"-info {annotation_infos}" 3412 else: 
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with `bcftools annotate`, using VCF/BED
        annotation databases.

        Each database declared in param["annotation"]["bcftools"]["annotations"]
        is resolved (through the `Database` object and the configured databases
        folders) and checked (compressed, tabix-indexed, header file available).
        For each database and each chromosome present in the variants table, a
        region BED is built around the variant positions and a `bcftools annotate`
        command is generated. All commands are run in parallel, the annotated VCFs
        are merged with `bcftools merge`, stderr files are scanned for
        warnings/errors, and the merged result is loaded back into the variants
        table with `update_from_vcf`.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :raises ValueError: if the bcftools binary is not found, if a database
            file is missing, not compressed, or not tabix-indexed, or if any
            generated command reported an "[E::" error on stderr
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but not referenced
        # afterwards in this method — tmp files are removed by the shell
        # "rm -f" appended to the merge command regardless; confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (required)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (annotations + bcftools folders,
        # deduplicated)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, as {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, fallback to default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF - temp file name reserved now, written later only if
        # there are commands to run
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (for debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators across all databases/chromosomes:
            # annotated VCFs to merge, commands to run, tmp files to remove,
            # stderr files to scan
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within databases folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools -a requires bgzip)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" expands to every
                    # INFO field declared in the database header)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (fallback defaults when the
                            # database header is incomplete)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # bcftools -c syntax: NEW:=INFO/OLD renames a field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes present in the variants table (one
                        # bcftools command is generated per chromosome)
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file (BED databases carry
                        # no column header, so prepend the positional columns)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions - +/- 1Mb windows around each
                            # variant, clamped at 0, then merged into intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files - annotated VCF output plus its stderr file
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command - annotate restricted to the region BED, then
                            # tabix-index the output; stderr appended to .err file
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (INFO stripped: annotations are re-merged after)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() can yield 0 when commands outnumber threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all annotated VCFs (one per database/chromosome)
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (appended to the merge shell command)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages - scan stderr files; htslib prefixes warnings
                    # with "[W::" and errors with "[E::"
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed - any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
3954 Used if no full "analysis" is provided. 3955 Default: "exome" 3956 - "phenopacket" (dict/file): 3957 Samples and phenotipic features parameters (see Exomiser docs). 3958 Either a dict, or a file in JSON or YAML format. 3959 Default: None 3960 - "subject" (dict): 3961 Sample parameters (see Exomiser docs). 3962 Example: 3963 "subject": 3964 { 3965 "id": "ISDBM322017", 3966 "sex": "FEMALE" 3967 } 3968 Default: None 3969 - "sample" (string): 3970 Sample name to construct "subject" section: 3971 "subject": 3972 { 3973 "id": "<sample>", 3974 "sex": "UNKNOWN_SEX" 3975 } 3976 Default: None 3977 - "phenotypicFeatures" (dict) 3978 Phenotypic features to construct "subject" section. 3979 Example: 3980 "phenotypicFeatures": 3981 [ 3982 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3983 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3984 ] 3985 - "hpo" (list) 3986 List of HPO ids as phenotypic features. 3987 Example: 3988 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3989 Default: [] 3990 - "outputOptions" (dict): 3991 Output options (see Exomiser docs). 3992 Default: 3993 "output_options" = 3994 { 3995 "outputContributingVariantsOnly": False, 3996 "numGenes": 0, 3997 "outputFormats": ["TSV_VARIANT", "VCF"] 3998 } 3999 - "transcript_source" (string): 4000 Transcript source (either "refseq", "ucsc", "ensembl") 4001 Default: "refseq" 4002 - "exomiser_to_info" (boolean): 4003 Add exomiser TSV file columns as INFO fields in VCF. 4004 Default: False 4005 - "release" (string): 4006 Exomise database release. 4007 If not exists, database release will be downloaded (take a while). 4008 Default: None (provided by application.properties configuration file) 4009 - "exomiser_application_properties" (file): 4010 Exomiser configuration file (see Exomiser docs). 4011 Useful to automatically download databases (especially for specific genome databases). 
4012 4013 Notes: 4014 - If no sample in parameters, first sample in VCF will be chosen 4015 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4016 4017 :param threads: The number of threads to use 4018 :return: None. 4019 """ 4020 4021 # DEBUG 4022 log.debug("Start annotation with Exomiser databases") 4023 4024 # Threads 4025 if not threads: 4026 threads = self.get_threads() 4027 log.debug("Threads: " + str(threads)) 4028 4029 # Config 4030 config = self.get_config() 4031 log.debug("Config: " + str(config)) 4032 4033 # Config - Folders - Databases 4034 databases_folders = ( 4035 config.get("folders", {}) 4036 .get("databases", {}) 4037 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4038 ) 4039 databases_folders = full_path(databases_folders) 4040 if not os.path.exists(databases_folders): 4041 log.error(f"Databases annotations: {databases_folders} NOT found") 4042 log.debug("Databases annotations: " + str(databases_folders)) 4043 4044 # Config - Exomiser 4045 exomiser_bin_command = get_bin_command( 4046 bin="exomiser-cli*.jar", 4047 tool="exomiser", 4048 bin_type="jar", 4049 config=config, 4050 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4051 ) 4052 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4053 if not exomiser_bin_command: 4054 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4055 log.error(msg_err) 4056 raise ValueError(msg_err) 4057 4058 # Param 4059 param = self.get_param() 4060 log.debug("Param: " + str(param)) 4061 4062 # Param - Exomiser 4063 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4064 log.debug(f"Param Exomiser: {param_exomiser}") 4065 4066 # Param - Assembly 4067 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4068 log.debug("Assembly: " + str(assembly)) 4069 4070 # Data 4071 table_variants = self.get_table_variants() 4072 4073 # Check if not empty 4074 log.debug("Check if not empty") 4075 sql_query_chromosomes = 
( 4076 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4077 ) 4078 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4079 log.info(f"VCF empty") 4080 return False 4081 4082 # VCF header 4083 vcf_reader = self.get_header() 4084 log.debug("Initial header: " + str(vcf_reader.infos)) 4085 4086 # Samples 4087 samples = self.get_header_sample_list() 4088 if not samples: 4089 log.error("No Samples in VCF") 4090 return False 4091 log.debug(f"Samples: {samples}") 4092 4093 # Memory limit 4094 memory_limit = self.get_memory("8G") 4095 log.debug(f"memory_limit: {memory_limit}") 4096 4097 # Exomiser java options 4098 exomiser_java_options = ( 4099 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4100 ) 4101 log.debug(f"Exomiser java options: {exomiser_java_options}") 4102 4103 # Download Exomiser (if not exists) 4104 exomiser_release = param_exomiser.get("release", None) 4105 exomiser_application_properties = param_exomiser.get( 4106 "exomiser_application_properties", None 4107 ) 4108 databases_download_exomiser( 4109 assemblies=[assembly], 4110 exomiser_folder=databases_folders, 4111 exomiser_release=exomiser_release, 4112 exomiser_phenotype_release=exomiser_release, 4113 exomiser_application_properties=exomiser_application_properties, 4114 ) 4115 4116 # Force annotation 4117 force_update_annotation = True 4118 4119 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4120 log.debug("Start annotation Exomiser") 4121 4122 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4123 4124 # tmp_dir = "/tmp/exomiser" 4125 4126 ### ANALYSIS ### 4127 ################ 4128 4129 # Create analysis.json through analysis dict 4130 # either analysis in param or by default 4131 # depending on preset exome/genome) 4132 4133 # Init analysis dict 4134 param_exomiser_analysis_dict = {} 4135 4136 # analysis from param 4137 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4138 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4139 4140 # If analysis in param -> load anlaysis json 4141 if param_exomiser_analysis: 4142 4143 # If param analysis is a file and exists 4144 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4145 param_exomiser_analysis 4146 ): 4147 # Load analysis file into analysis dict (either yaml or json) 4148 with open(param_exomiser_analysis) as json_file: 4149 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4150 4151 # If param analysis is a dict 4152 elif isinstance(param_exomiser_analysis, dict): 4153 # Load analysis dict into analysis dict (either yaml or json) 4154 param_exomiser_analysis_dict = param_exomiser_analysis 4155 4156 # Error analysis type 4157 else: 4158 log.error(f"Analysis type unknown. Check param file.") 4159 raise ValueError(f"Analysis type unknown. Check param file.") 4160 4161 # Case no input analysis config file/dict 4162 # Use preset (exome/genome) to open default config file 4163 if not param_exomiser_analysis_dict: 4164 4165 # default preset 4166 default_preset = "exome" 4167 4168 # Get param preset or default preset 4169 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4170 4171 # Try to find if preset is a file 4172 if os.path.exists(param_exomiser_preset): 4173 # Preset file is provided in full path 4174 param_exomiser_analysis_default_config_file = ( 4175 param_exomiser_preset 4176 ) 4177 # elif os.path.exists(full_path(param_exomiser_preset)): 4178 # # Preset file is provided in full path 4179 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4180 elif os.path.exists( 4181 os.path.join(folder_config, param_exomiser_preset) 4182 ): 4183 # Preset file is provided a basename in config folder (can be a path with subfolders) 4184 param_exomiser_analysis_default_config_file = os.path.join( 4185 folder_config, param_exomiser_preset 4186 ) 4187 else: 4188 # Construct preset file 4189 
param_exomiser_analysis_default_config_file = os.path.join( 4190 folder_config, 4191 f"preset-{param_exomiser_preset}-analysis.json", 4192 ) 4193 4194 # If preset file exists 4195 param_exomiser_analysis_default_config_file = full_path( 4196 param_exomiser_analysis_default_config_file 4197 ) 4198 if os.path.exists(param_exomiser_analysis_default_config_file): 4199 # Load prest file into analysis dict (either yaml or json) 4200 with open( 4201 param_exomiser_analysis_default_config_file 4202 ) as json_file: 4203 # param_exomiser_analysis_dict[""] = json.load(json_file) 4204 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4205 json_file 4206 ) 4207 4208 # Error preset file 4209 else: 4210 log.error( 4211 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4212 ) 4213 raise ValueError( 4214 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4215 ) 4216 4217 # If no analysis dict created 4218 if not param_exomiser_analysis_dict: 4219 log.error(f"No analysis config") 4220 raise ValueError(f"No analysis config") 4221 4222 # Log 4223 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4224 4225 ### PHENOPACKET ### 4226 ################### 4227 4228 # If no PhenoPacket in analysis dict -> check in param 4229 if "phenopacket" not in param_exomiser_analysis_dict: 4230 4231 # If PhenoPacket in param -> load anlaysis json 4232 if param_exomiser.get("phenopacket", None): 4233 4234 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4235 param_exomiser_phenopacket = full_path( 4236 param_exomiser_phenopacket 4237 ) 4238 4239 # If param phenopacket is a file and exists 4240 if isinstance( 4241 param_exomiser_phenopacket, str 4242 ) and os.path.exists(param_exomiser_phenopacket): 4243 # Load phenopacket file into analysis dict (either yaml or json) 4244 with open(param_exomiser_phenopacket) as json_file: 4245 param_exomiser_analysis_dict["phenopacket"] = ( 4246 yaml.safe_load(json_file) 
4247 ) 4248 4249 # If param phenopacket is a dict 4250 elif isinstance(param_exomiser_phenopacket, dict): 4251 # Load phenopacket dict into analysis dict (either yaml or json) 4252 param_exomiser_analysis_dict["phenopacket"] = ( 4253 param_exomiser_phenopacket 4254 ) 4255 4256 # Error phenopacket type 4257 else: 4258 log.error(f"Phenopacket type unknown. Check param file.") 4259 raise ValueError( 4260 f"Phenopacket type unknown. Check param file." 4261 ) 4262 4263 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4264 if "phenopacket" not in param_exomiser_analysis_dict: 4265 4266 # Init PhenoPacket 4267 param_exomiser_analysis_dict["phenopacket"] = { 4268 "id": "analysis", 4269 "proband": {}, 4270 } 4271 4272 ### Add subject ### 4273 4274 # If subject exists 4275 param_exomiser_subject = param_exomiser.get("subject", {}) 4276 4277 # If subject not exists -> found sample ID 4278 if not param_exomiser_subject: 4279 4280 # Found sample ID in param 4281 sample = param_exomiser.get("sample", None) 4282 4283 # Find sample ID (first sample) 4284 if not sample: 4285 sample_list = self.get_header_sample_list() 4286 if len(sample_list) > 0: 4287 sample = sample_list[0] 4288 else: 4289 log.error(f"No sample found") 4290 raise ValueError(f"No sample found") 4291 4292 # Create subject 4293 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4294 4295 # Add to dict 4296 param_exomiser_analysis_dict["phenopacket"][ 4297 "subject" 4298 ] = param_exomiser_subject 4299 4300 ### Add "phenotypicFeatures" ### 4301 4302 # If phenotypicFeatures exists 4303 param_exomiser_phenotypicfeatures = param_exomiser.get( 4304 "phenotypicFeatures", [] 4305 ) 4306 4307 # If phenotypicFeatures not exists -> Try to infer from hpo list 4308 if not param_exomiser_phenotypicfeatures: 4309 4310 # Found HPO in param 4311 param_exomiser_hpo = param_exomiser.get("hpo", []) 4312 4313 # Split HPO if list in string format separated by comma 4314 if 
isinstance(param_exomiser_hpo, str): 4315 param_exomiser_hpo = param_exomiser_hpo.split(",") 4316 4317 # Create HPO list 4318 for hpo in param_exomiser_hpo: 4319 hpo_clean = re.sub("[^0-9]", "", hpo) 4320 param_exomiser_phenotypicfeatures.append( 4321 { 4322 "type": { 4323 "id": f"HP:{hpo_clean}", 4324 "label": f"HP:{hpo_clean}", 4325 } 4326 } 4327 ) 4328 4329 # Add to dict 4330 param_exomiser_analysis_dict["phenopacket"][ 4331 "phenotypicFeatures" 4332 ] = param_exomiser_phenotypicfeatures 4333 4334 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4335 if not param_exomiser_phenotypicfeatures: 4336 for step in param_exomiser_analysis_dict.get( 4337 "analysis", {} 4338 ).get("steps", []): 4339 if "hiPhivePrioritiser" in step: 4340 param_exomiser_analysis_dict.get("analysis", {}).get( 4341 "steps", [] 4342 ).remove(step) 4343 4344 ### Add Input File ### 4345 4346 # Initial file name and htsFiles 4347 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4348 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4349 { 4350 "uri": tmp_vcf_name, 4351 "htsFormat": "VCF", 4352 "genomeAssembly": assembly, 4353 } 4354 ] 4355 4356 ### Add metaData ### 4357 4358 # If metaData not in analysis dict 4359 if "metaData" not in param_exomiser_analysis_dict: 4360 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4361 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4362 "createdBy": "howard", 4363 "phenopacketSchemaVersion": 1, 4364 } 4365 4366 ### OutputOptions ### 4367 4368 # Init output result folder 4369 output_results = os.path.join(tmp_dir, "results") 4370 4371 # If no outputOptions in analysis dict 4372 if "outputOptions" not in param_exomiser_analysis_dict: 4373 4374 # default output formats 4375 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4376 4377 # Get outputOptions in param 4378 output_options = param_exomiser.get("outputOptions", None) 4379 4380 # If no output_options in param -> check 4381 if not output_options: 
4382 output_options = { 4383 "outputContributingVariantsOnly": False, 4384 "numGenes": 0, 4385 "outputFormats": defaut_output_formats, 4386 } 4387 4388 # Replace outputDirectory in output options 4389 output_options["outputDirectory"] = output_results 4390 output_options["outputFileName"] = "howard" 4391 4392 # Add outputOptions in analysis dict 4393 param_exomiser_analysis_dict["outputOptions"] = output_options 4394 4395 else: 4396 4397 # Replace output_results and output format (if exists in param) 4398 param_exomiser_analysis_dict["outputOptions"][ 4399 "outputDirectory" 4400 ] = output_results 4401 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4402 list( 4403 set( 4404 param_exomiser_analysis_dict.get( 4405 "outputOptions", {} 4406 ).get("outputFormats", []) 4407 + ["TSV_VARIANT", "VCF"] 4408 ) 4409 ) 4410 ) 4411 4412 # log 4413 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4414 4415 ### ANALYSIS FILE ### 4416 ##################### 4417 4418 ### Full JSON analysis config file ### 4419 4420 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4421 with open(exomiser_analysis, "w") as fp: 4422 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4423 4424 ### SPLIT analysis and sample config files 4425 4426 # Splitted analysis dict 4427 param_exomiser_analysis_dict_for_split = ( 4428 param_exomiser_analysis_dict.copy() 4429 ) 4430 4431 # Phenopacket JSON file 4432 exomiser_analysis_phenopacket = os.path.join( 4433 tmp_dir, "analysis_phenopacket.json" 4434 ) 4435 with open(exomiser_analysis_phenopacket, "w") as fp: 4436 json.dump( 4437 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4438 fp, 4439 indent=4, 4440 ) 4441 4442 # Analysis JSON file without Phenopacket parameters 4443 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4444 exomiser_analysis_analysis = os.path.join( 4445 tmp_dir, "analysis_analysis.json" 4446 ) 4447 with open(exomiser_analysis_analysis, "w") as fp: 4448 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4449 4450 ### INITAL VCF file ### 4451 ####################### 4452 4453 ### Create list of samples to use and include inti initial VCF file #### 4454 4455 # Subject (main sample) 4456 # Get sample ID in analysis dict 4457 sample_subject = ( 4458 param_exomiser_analysis_dict.get("phenopacket", {}) 4459 .get("subject", {}) 4460 .get("id", None) 4461 ) 4462 sample_proband = ( 4463 param_exomiser_analysis_dict.get("phenopacket", {}) 4464 .get("proband", {}) 4465 .get("subject", {}) 4466 .get("id", None) 4467 ) 4468 sample = [] 4469 if sample_subject: 4470 sample.append(sample_subject) 4471 if sample_proband: 4472 sample.append(sample_proband) 4473 4474 # Get sample ID within Pedigree 4475 pedigree_persons_list = ( 4476 param_exomiser_analysis_dict.get("phenopacket", {}) 4477 .get("pedigree", {}) 4478 .get("persons", {}) 4479 ) 4480 4481 # Create list with all sample ID in pedigree (if exists) 4482 pedigree_persons = [] 4483 for person in pedigree_persons_list: 4484 pedigree_persons.append(person.get("individualId")) 4485 4486 # Concat subject sample ID and samples ID in pedigreesamples 4487 samples = list(set(sample + pedigree_persons)) 4488 4489 # Check if sample list is not empty 4490 if not samples: 4491 log.error(f"No samples found") 4492 raise ValueError(f"No samples found") 4493 4494 # Create VCF with sample (either sample in param or first one by default) 4495 # Export VCF file 4496 self.export_variant_vcf( 4497 vcf_file=tmp_vcf_name, 4498 remove_info=True, 4499 add_samples=True, 4500 list_samples=samples, 4501 index=False, 4502 ) 4503 4504 ### Execute Exomiser ### 4505 ######################## 4506 4507 # Init command 4508 exomiser_command = "" 4509 4510 # Command exomiser options 4511 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4512 4513 # Release 4514 exomiser_release = 
param_exomiser.get("release", None) 4515 if exomiser_release: 4516 # phenotype data version 4517 exomiser_options += ( 4518 f" --exomiser.phenotype.data-version={exomiser_release} " 4519 ) 4520 # data version 4521 exomiser_options += ( 4522 f" --exomiser.{assembly}.data-version={exomiser_release} " 4523 ) 4524 # variant white list 4525 variant_white_list_file = ( 4526 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4527 ) 4528 if os.path.exists( 4529 os.path.join( 4530 databases_folders, assembly, variant_white_list_file 4531 ) 4532 ): 4533 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4534 4535 # transcript_source 4536 transcript_source = param_exomiser.get( 4537 "transcript_source", None 4538 ) # ucsc, refseq, ensembl 4539 if transcript_source: 4540 exomiser_options += ( 4541 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4542 ) 4543 4544 # If analysis contain proband param 4545 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4546 "proband", {} 4547 ): 4548 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4549 4550 # If no proband (usually uniq sample) 4551 else: 4552 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4553 4554 # Log 4555 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4556 4557 # Run command 4558 result = subprocess.call( 4559 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4560 ) 4561 if result: 4562 log.error("Exomiser command failed") 4563 raise ValueError("Exomiser command failed") 4564 4565 ### RESULTS ### 4566 ############### 4567 4568 ### Annotate with TSV fields ### 4569 4570 # Init result tsv file 4571 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4572 4573 # Init result tsv file 4574 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4575 4576 # Parse TSV file and explode columns in INFO field 4577 if exomiser_to_info and os.path.exists(output_results_tsv): 4578 4579 # Log 4580 log.debug("Exomiser columns to VCF INFO field") 4581 4582 # Retrieve columns and types 4583 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4584 output_results_tsv_df = self.get_query_to_df(query) 4585 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4586 4587 # Init concat fields for update 4588 sql_query_update_concat_fields = [] 4589 4590 # Fields to avoid 4591 fields_to_avoid = [ 4592 "CONTIG", 4593 "START", 4594 "END", 4595 "REF", 4596 "ALT", 4597 "QUAL", 4598 "FILTER", 4599 "GENOTYPE", 4600 ] 4601 4602 # List all columns to add into header 4603 for header_column in output_results_tsv_columns: 4604 4605 # If header column is enable 4606 if header_column not in fields_to_avoid: 4607 4608 # Header info type 4609 header_info_type = "String" 4610 header_column_df = output_results_tsv_df[header_column] 4611 header_column_df_dtype = header_column_df.dtype 4612 if header_column_df_dtype == object: 4613 if ( 4614 pd.to_numeric(header_column_df, errors="coerce") 4615 .notnull() 4616 .all() 4617 ): 4618 header_info_type = "Float" 4619 else: 4620 header_info_type = "Integer" 4621 4622 # Header info 4623 characters_to_validate = ["-"] 4624 pattern = "[" + "".join(characters_to_validate) + "]" 4625 header_info_name = re.sub( 4626 pattern, 4627 "_", 4628 f"Exomiser_{header_column}".replace("#", ""), 4629 ) 4630 header_info_number = "." 
4631 header_info_description = ( 4632 f"Exomiser {header_column} annotation" 4633 ) 4634 header_info_source = "Exomiser" 4635 header_info_version = "unknown" 4636 header_info_code = CODE_TYPE_MAP[header_info_type] 4637 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4638 header_info_name, 4639 header_info_number, 4640 header_info_type, 4641 header_info_description, 4642 header_info_source, 4643 header_info_version, 4644 header_info_code, 4645 ) 4646 4647 # Add field to add for update to concat fields 4648 sql_query_update_concat_fields.append( 4649 f""" 4650 CASE 4651 WHEN table_parquet."{header_column}" NOT IN ('','.') 4652 THEN concat( 4653 '{header_info_name}=', 4654 table_parquet."{header_column}", 4655 ';' 4656 ) 4657 4658 ELSE '' 4659 END 4660 """ 4661 ) 4662 4663 # Update query 4664 sql_query_update = f""" 4665 UPDATE {table_variants} as table_variants 4666 SET INFO = concat( 4667 CASE 4668 WHEN INFO NOT IN ('', '.') 4669 THEN INFO 4670 ELSE '' 4671 END, 4672 CASE 4673 WHEN table_variants.INFO NOT IN ('','.') 4674 THEN ';' 4675 ELSE '' 4676 END, 4677 ( 4678 SELECT 4679 concat( 4680 {",".join(sql_query_update_concat_fields)} 4681 ) 4682 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4683 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4684 AND table_parquet.\"START\" = table_variants.\"POS\" 4685 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4686 AND table_parquet.\"REF\" = table_variants.\"REF\" 4687 ) 4688 ) 4689 ; 4690 """ 4691 4692 # Update 4693 self.conn.execute(sql_query_update) 4694 4695 ### Annotate with VCF INFO field ### 4696 4697 # Init result VCF file 4698 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4699 4700 # If VCF exists 4701 if os.path.exists(output_results_vcf): 4702 4703 # Log 4704 log.debug("Exomiser result VCF update variants") 4705 4706 # Find Exomiser INFO field annotation in header 4707 with 
gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        This function annotate with snpEff

        Exports the current variants to a temporary VCF, runs snpEff on it,
        merges the new header INFO fields, and updates the variants table
        from the annotated VCF.

        :param threads: The number of threads to use
        :return: None (returns early when the VCF is empty)
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used later in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # # Config - Java
        # java_bin = get_bin(
        #     tool="java",
        #     bin="java",
        #     bin_type="bin",
        #     config=config,
        #     default_folder="/usr/bin",
        # )
        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")

        # # Config - snpEff bin
        # snpeff_jar = get_bin(
        #     tool="snpeff",
        #     bin="snpEff.jar",
        #     bin_type="jar",
        #     config=config,
        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        # )
        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")

        # Config - snpEff bin command
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options
        # "OUTPUT" placeholder in stats/csvStats paths is replaced by the output file name
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): built but not referenced in the snpeff command below — confirm
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # force_update_annotation is always True, so annotation always runs
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages
            # Collect stderr lines: "[W::" = warning, "[E::" = error
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any "[E::" line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header
            # NOTE(review): local is named annovar_vcf_header but holds the snpEff
            # output header — likely a copy-paste name
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # Unreachable while force_update_annotation is hard-coded True
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        :param threads: number of threads to use
        :return: None (returns early when the VCF is empty)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
bin_type="bin", 5015 config=config, 5016 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5017 ) 5018 if not bcftools_bin_command: 5019 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5020 log.error(msg_err) 5021 raise ValueError(msg_err) 5022 5023 # Config - annovar databases 5024 annovar_databases = ( 5025 config.get("folders", {}) 5026 .get("databases", {}) 5027 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5028 ) 5029 annovar_databases = full_path(annovar_databases) 5030 if annovar_databases != "" and not os.path.exists(annovar_databases): 5031 os.makedirs(annovar_databases) 5032 5033 # Param 5034 param = self.get_param() 5035 log.debug("Param: " + str(param)) 5036 5037 # Param - options 5038 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5039 log.debug("Options: " + str(options)) 5040 5041 # Param - annotations 5042 annotations = ( 5043 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5044 ) 5045 log.debug("Annotations: " + str(annotations)) 5046 5047 # Param - Assembly 5048 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5049 5050 # Annovar database assembly 5051 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5052 if annovar_databases_assembly != "" and not os.path.exists( 5053 annovar_databases_assembly 5054 ): 5055 os.makedirs(annovar_databases_assembly) 5056 5057 # Data 5058 table_variants = self.get_table_variants() 5059 5060 # Check if not empty 5061 log.debug("Check if not empty") 5062 sql_query_chromosomes = ( 5063 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5064 ) 5065 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5066 if not sql_query_chromosomes_df["count"][0]: 5067 log.info(f"VCF empty") 5068 return 5069 5070 # VCF header 5071 vcf_reader = self.get_header() 5072 log.debug("Initial header: " + str(vcf_reader.infos)) 5073 5074 # Existing annotations 5075 for vcf_annotation in 
self.get_header().infos: 5076 5077 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5078 log.debug( 5079 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5080 ) 5081 5082 force_update_annotation = True 5083 5084 if annotations: 5085 5086 commands = [] 5087 tmp_annotates_vcf_name_list = [] 5088 5089 # Export in VCF 5090 log.debug("Create initial file to annotate") 5091 tmp_vcf = NamedTemporaryFile( 5092 prefix=self.get_prefix(), 5093 dir=self.get_tmp_dir(), 5094 suffix=".vcf.gz", 5095 delete=False, 5096 ) 5097 tmp_vcf_name = tmp_vcf.name 5098 tmp_files.append(tmp_vcf_name) 5099 tmp_files.append(tmp_vcf_name + ".tbi") 5100 5101 # Export VCF file 5102 self.export_variant_vcf( 5103 vcf_file=tmp_vcf_name, 5104 remove_info=".", 5105 add_samples=False, 5106 index=True, 5107 ) 5108 5109 # Create file for field rename 5110 log.debug("Create file for field rename") 5111 tmp_rename = NamedTemporaryFile( 5112 prefix=self.get_prefix(), 5113 dir=self.get_tmp_dir(), 5114 suffix=".rename", 5115 delete=False, 5116 ) 5117 tmp_rename_name = tmp_rename.name 5118 tmp_files.append(tmp_rename_name) 5119 5120 # Check Annovar database 5121 log.debug( 5122 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5123 ) 5124 databases_download_annovar( 5125 folder=annovar_databases, 5126 files=list(annotations.keys()), 5127 assemblies=[assembly], 5128 ) 5129 5130 for annotation in annotations: 5131 annotation_fields = annotations[annotation] 5132 5133 if not annotation_fields: 5134 annotation_fields = {"INFO": None} 5135 5136 log.info(f"Annotations Annovar - database '{annotation}'") 5137 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5138 5139 # Tmp file for annovar 5140 err_files = [] 5141 tmp_annotate_vcf_directory = TemporaryDirectory( 5142 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5143 ) 5144 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5145 
tmp_annotate_vcf_name_annovar = ( 5146 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5147 ) 5148 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5149 err_files.append(tmp_annotate_vcf_name_err) 5150 tmp_files.append(tmp_annotate_vcf_name_err) 5151 5152 # Tmp file final vcf annotated by annovar 5153 tmp_annotate_vcf = NamedTemporaryFile( 5154 prefix=self.get_prefix(), 5155 dir=self.get_tmp_dir(), 5156 suffix=".vcf.gz", 5157 delete=False, 5158 ) 5159 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5160 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5161 tmp_files.append(tmp_annotate_vcf_name) 5162 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5163 5164 # Number of fields 5165 annotation_list = [] 5166 annotation_renamed_list = [] 5167 5168 for annotation_field in annotation_fields: 5169 5170 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5171 annotation_fields_new_name = annotation_fields.get( 5172 annotation_field, annotation_field 5173 ) 5174 if not annotation_fields_new_name: 5175 annotation_fields_new_name = annotation_field 5176 5177 if ( 5178 force_update_annotation 5179 or annotation_fields_new_name not in self.get_header().infos 5180 ): 5181 annotation_list.append(annotation_field) 5182 annotation_renamed_list.append(annotation_fields_new_name) 5183 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5184 log.warning( 5185 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5186 ) 5187 5188 # Add rename info 5189 run_parallel_commands( 5190 [ 5191 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5192 ], 5193 1, 5194 ) 5195 5196 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5197 log.debug("annotation_list: " + str(annotation_list)) 5198 5199 # protocol 5200 protocol = annotation 5201 5202 # argument 5203 argument = "" 5204 5205 # operation 5206 operation = "f" 
5207 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5208 "ensGene" 5209 ): 5210 operation = "g" 5211 if options.get("genebase", None): 5212 argument = f"""'{options.get("genebase","")}'""" 5213 elif annotation in ["cytoBand"]: 5214 operation = "r" 5215 5216 # argument option 5217 argument_option = "" 5218 if argument != "": 5219 argument_option = " --argument " + argument 5220 5221 # command options 5222 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5223 for option in options: 5224 if option not in ["genebase"]: 5225 command_options += f""" --{option}={options[option]}""" 5226 5227 # Command 5228 5229 # Command - Annovar 5230 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5231 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5232 5233 # Command - start pipe 5234 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5235 5236 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5237 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5238 5239 # Command - Special characters (refGene annotation) 5240 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5241 5242 # Command - Clean empty fields (with value ".") 5243 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5244 5245 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5246 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5247 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5248 # for ann in annotation_renamed_list: 5249 for ann in annotation_list: 5250 annovar_fields_to_keep.append(f"^INFO/{ann}") 5251 5252 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5253 5254 # Command - indexing 5255 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5256 5257 log.debug(f"Annotation - Annovar command: {command_annovar}") 5258 run_parallel_commands([command_annovar], 1) 5259 5260 # Error messages 5261 log.info(f"Error/Warning messages:") 5262 error_message_command_all = [] 5263 error_message_command_warning = [] 5264 error_message_command_err = [] 5265 for err_file in err_files: 5266 with open(err_file, "r") as f: 5267 for line in f: 5268 message = line.strip() 5269 error_message_command_all.append(message) 5270 if line.startswith("[W::") or line.startswith("WARNING"): 5271 error_message_command_warning.append(message) 5272 if line.startswith("[E::") or line.startswith("ERROR"): 5273 
error_message_command_err.append( 5274 f"{err_file}: " + message 5275 ) 5276 # log info 5277 for message in list( 5278 set(error_message_command_err + error_message_command_warning) 5279 ): 5280 log.info(f" {message}") 5281 # debug info 5282 for message in list(set(error_message_command_all)): 5283 log.debug(f" {message}") 5284 # failed 5285 if len(error_message_command_err): 5286 log.error("Annotation failed: Error in commands") 5287 raise ValueError("Annotation failed: Error in commands") 5288 5289 if tmp_annotates_vcf_name_list: 5290 5291 # List of annotated files 5292 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5293 5294 # Tmp file 5295 tmp_annotate_vcf = NamedTemporaryFile( 5296 prefix=self.get_prefix(), 5297 dir=self.get_tmp_dir(), 5298 suffix=".vcf.gz", 5299 delete=False, 5300 ) 5301 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5302 tmp_files.append(tmp_annotate_vcf_name) 5303 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5304 err_files.append(tmp_annotate_vcf_name_err) 5305 tmp_files.append(tmp_annotate_vcf_name_err) 5306 5307 # Command merge 5308 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5309 log.info( 5310 f"Annotation Annovar - Annotation merging " 5311 + str(len(tmp_annotates_vcf_name_list)) 5312 + " annotated files" 5313 ) 5314 log.debug(f"Annotation - merge command: {merge_command}") 5315 run_parallel_commands([merge_command], 1) 5316 5317 # Find annotation in header 5318 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5319 header_list = self.read_vcf_header(f) 5320 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5321 5322 for ann in annovar_vcf_header.infos: 5323 if ann not in self.get_header().infos: 5324 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5325 5326 # Update variants 5327 log.info(f"Annotation Annovar - 
Updating...") 5328 self.update_from_vcf(tmp_annotate_vcf_name) 5329 5330 # Clean files 5331 # Tmp file remove command 5332 if True: 5333 tmp_files_remove_command = "" 5334 if tmp_files: 5335 tmp_files_remove_command = " ".join(tmp_files) 5336 clean_command = f" rm -f {tmp_files_remove_command} " 5337 log.debug(f"Annotation Annovar - Annotation cleaning ") 5338 log.debug(f"Annotation - cleaning command: {clean_command}") 5339 run_parallel_commands([clean_command], 1) 5340 5341 # Parquet 5342 def annotation_parquet(self, threads: int = None) -> None: 5343 """ 5344 It takes a VCF file, and annotates it with a parquet file 5345 5346 :param threads: number of threads to use for the annotation 5347 :return: the value of the variable "result". 5348 """ 5349 5350 # DEBUG 5351 log.debug("Start annotation with parquet databases") 5352 5353 # Threads 5354 if not threads: 5355 threads = self.get_threads() 5356 log.debug("Threads: " + str(threads)) 5357 5358 # DEBUG 5359 delete_tmp = True 5360 if self.get_config().get("verbosity", "warning") in ["debug"]: 5361 delete_tmp = False 5362 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5363 5364 # Config 5365 databases_folders = set( 5366 self.get_config() 5367 .get("folders", {}) 5368 .get("databases", {}) 5369 .get("annotations", ["."]) 5370 + self.get_config() 5371 .get("folders", {}) 5372 .get("databases", {}) 5373 .get("parquet", ["."]) 5374 ) 5375 log.debug("Databases annotations: " + str(databases_folders)) 5376 5377 # Param 5378 annotations = ( 5379 self.get_param() 5380 .get("annotation", {}) 5381 .get("parquet", {}) 5382 .get("annotations", None) 5383 ) 5384 log.debug("Annotations: " + str(annotations)) 5385 5386 # Assembly 5387 assembly = self.get_param().get( 5388 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5389 ) 5390 5391 # Force Update Annotation 5392 force_update_annotation = ( 5393 self.get_param() 5394 .get("annotation", {}) 5395 .get("options", {}) 5396 .get("annotations_update", 
    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table using parquet (or duckdb-attachable)
        annotation databases.

        For each configured database, the database header is loaded, the
        requested INFO fields are mapped to database columns, and SQL
        fragments of the form ``CASE WHEN ... THEN 'FIELD=value' END`` are
        assembled. One UPDATE query per chromosome then concatenates the
        produced ``FIELD=value`` pairs into the INFO column of the variants
        table. 'regions' databases are matched by interval overlap on
        POS/START/END; variant databases are matched on
        #CHROM/POS/REF/ALT. New INFO definitions are added to the
        in-memory VCF header.

        Options (param 'annotation.options'):
          - annotations_update: remove then rewrite existing fields
          - annotations_append: only fill fields that are empty/'.'

        :param threads: number of threads to use; defaults to
            ``self.get_threads()`` when not provided
        :raises ValueError: if a database file or its header cannot be found
        :return: None; the variants table and header are updated in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG - keep tmp files around when running in debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - merge 'annotations' and 'parquet' database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - annotations: {database: {field: renamed_field_or_None}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation (remove and rewrite existing fields)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Force Append Annotation (only fill fields that are empty or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total, used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (logged for traceability only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped again at the end of the method)
        added_columns = []

        # drop indexes (UPDATEs are faster without them)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # 'ALL' pseudo-database: scan available databases and add them all
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # 'ALL' is a directive, not a real database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (no explicit fields means: whole INFO)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH for duckdb-backed databases)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a String INFO definition for the extra column
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for 'regions' databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column name)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Annotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO before re-annotating
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults when
                            # the database header is incomplete)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate where the target field is empty/'.'
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the whole INFO column in one go
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init (removal queries run first, before the annotation queries)
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database (interval overlap on POS/START/END)
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database (exact CHROM/POS/REF/ALT match)
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants.\"#CHROM\" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append new 'FIELD=value' pairs
                            # to INFO, inserting ';' only when INFO is non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (generated queries can be deeply nested)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # duckdb UPDATE returns the affected row count as 'Count'
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
5924 """ 5925 5926 # DEBUG 5927 log.debug("Start annotation with splice tools") 5928 5929 # Threads 5930 if not threads: 5931 threads = self.get_threads() 5932 log.debug("Threads: " + str(threads)) 5933 5934 # DEBUG 5935 delete_tmp = True 5936 if self.get_config().get("verbosity", "warning") in ["debug"]: 5937 delete_tmp = False 5938 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5939 5940 # Config 5941 config = self.get_config() 5942 log.debug("Config: " + str(config)) 5943 splice_config = config.get("tools", {}).get("splice", {}) 5944 if not splice_config: 5945 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5946 if not splice_config: 5947 msg_err = "No Splice tool config" 5948 log.error(msg_err) 5949 raise ValueError(msg_err) 5950 log.debug(f"splice_config={splice_config}") 5951 5952 # Config - Folders - Databases 5953 databases_folders = ( 5954 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5955 ) 5956 log.debug("Databases annotations: " + str(databases_folders)) 5957 5958 # Splice docker image 5959 splice_docker_image = splice_config.get("docker").get("image") 5960 5961 # Pull splice image if it's not already there 5962 if not check_docker_image_exists(splice_docker_image): 5963 log.warning( 5964 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5965 ) 5966 try: 5967 command(f"docker pull {splice_config.get('docker').get('image')}") 5968 except subprocess.CalledProcessError: 5969 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5970 log.error(msg_err) 5971 raise ValueError(msg_err) 5972 return None 5973 5974 # Config - splice databases 5975 splice_databases = ( 5976 config.get("folders", {}) 5977 .get("databases", {}) 5978 .get("splice", DEFAULT_SPLICE_FOLDER) 5979 ) 5980 splice_databases = full_path(splice_databases) 5981 5982 # Param 5983 param = self.get_param() 5984 log.debug("Param: " + str(param)) 5985 5986 # Param 5987 options = 
param.get("annotation", {}).get("splice", {}) 5988 log.debug("Options: " + str(options)) 5989 5990 # Data 5991 table_variants = self.get_table_variants() 5992 5993 # Check if not empty 5994 log.debug("Check if not empty") 5995 sql_query_chromosomes = ( 5996 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5997 ) 5998 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5999 log.info("VCF empty") 6000 return None 6001 6002 # Export in VCF 6003 log.debug("Create initial file to annotate") 6004 6005 # Create output folder 6006 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6007 if not os.path.exists(output_folder): 6008 Path(output_folder).mkdir(parents=True, exist_ok=True) 6009 6010 # Create tmp VCF file 6011 tmp_vcf = NamedTemporaryFile( 6012 prefix=self.get_prefix(), 6013 dir=output_folder, 6014 suffix=".vcf", 6015 delete=False, 6016 ) 6017 tmp_vcf_name = tmp_vcf.name 6018 6019 # VCF header 6020 header = self.get_header() 6021 6022 # Existing annotations 6023 for vcf_annotation in self.get_header().infos: 6024 6025 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6026 log.debug( 6027 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6028 ) 6029 6030 # Memory limit 6031 if config.get("memory", None): 6032 memory_limit = config.get("memory", "8G").upper() 6033 # upper() 6034 else: 6035 memory_limit = "8G" 6036 log.debug(f"memory_limit: {memory_limit}") 6037 6038 # Check number of variants to annotate 6039 where_clause_regex_spliceai = r"SpliceAI_\w+" 6040 where_clause_regex_spip = r"SPiP_\w+" 6041 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6042 df_list_of_variants_to_annotate = self.get_query_to_df( 6043 query=f""" SELECT * FROM variants {where_clause} """ 6044 ) 6045 if len(df_list_of_variants_to_annotate) == 0: 6046 log.warning( 6047 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6048 ) 6049 return None 6050 else: 6051 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6052 6053 # Export VCF file 6054 self.export_variant_vcf( 6055 vcf_file=tmp_vcf_name, 6056 remove_info=True, 6057 add_samples=True, 6058 index=False, 6059 where_clause=where_clause, 6060 ) 6061 6062 # Create docker container and launch splice analysis 6063 if splice_config: 6064 6065 # Splice mount folders 6066 mount_folders = splice_config.get("mount", {}) 6067 6068 # Genome mount 6069 mount_folders[ 6070 config.get("folders", {}) 6071 .get("databases", {}) 6072 .get("genomes", DEFAULT_GENOME_FOLDER) 6073 ] = "ro" 6074 6075 # SpliceAI mount 6076 mount_folders[ 6077 config.get("folders", {}) 6078 .get("databases", {}) 6079 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6080 ] = "ro" 6081 6082 # Genome mount 6083 mount_folders[ 6084 config.get("folders", {}) 6085 .get("databases", {}) 6086 .get("spip", DEFAULT_SPIP_FOLDER) 6087 ] = "ro" 6088 6089 # Mount folders 6090 mount = [] 6091 6092 # Config mount 6093 mount = [ 6094 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6095 for path, mode in mount_folders.items() 6096 ] 6097 6098 if any(value for value in splice_config.values() if value is None): 6099 log.warning("At least one splice config parameter is empty") 6100 return None 6101 6102 # Params in splice nf 6103 def check_values(dico: dict): 6104 """ 6105 Ensure parameters for NF splice pipeline 6106 """ 6107 for key, val in dico.items(): 6108 if key == "genome": 6109 if any( 6110 assemb in options.get("genome", {}) 6111 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6112 ): 6113 yield f"--{key} hg19" 6114 elif any( 6115 assemb in options.get("genome", {}) 6116 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6117 ): 6118 yield f"--{key} hg38" 6119 elif ( 6120 (isinstance(val, str) and val) 6121 or isinstance(val, int) 6122 or isinstance(val, bool) 6123 ): 6124 yield f"--{key} 
{val}" 6125 6126 # Genome 6127 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6128 options["genome"] = genome 6129 6130 # NF params 6131 nf_params = [] 6132 6133 # Add options 6134 if options: 6135 nf_params = list(check_values(options)) 6136 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6137 else: 6138 log.debug("No NF params provided") 6139 6140 # Add threads 6141 if "threads" not in options.keys(): 6142 nf_params.append(f"--threads {threads}") 6143 6144 # Genome path 6145 genome_path = find_genome( 6146 config.get("folders", {}) 6147 .get("databases", {}) 6148 .get("genomes", DEFAULT_GENOME_FOLDER), 6149 file=f"{genome}.fa", 6150 ) 6151 # Add genome path 6152 if not genome_path: 6153 raise ValueError( 6154 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6155 ) 6156 else: 6157 log.debug(f"Genome: {genome_path}") 6158 nf_params.append(f"--genome_path {genome_path}") 6159 6160 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6161 """ 6162 Setting up updated databases for SPiP and SpliceAI 6163 """ 6164 6165 try: 6166 6167 # SpliceAI assembly transcriptome 6168 spliceai_assembly = os.path.join( 6169 config.get("folders", {}) 6170 .get("databases", {}) 6171 .get("spliceai", {}), 6172 options.get("genome"), 6173 "transcriptome", 6174 ) 6175 spip_assembly = options.get("genome") 6176 6177 spip = find( 6178 f"transcriptome_{spip_assembly}.RData", 6179 config.get("folders", {}).get("databases", {}).get("spip", {}), 6180 ) 6181 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6182 log.debug(f"SPiP annotations: {spip}") 6183 log.debug(f"SpliceAI annotations: {spliceai}") 6184 if spip and spliceai: 6185 return [ 6186 f"--spip_transcriptome {spip}", 6187 f"--spliceai_annotations {spliceai}", 6188 ] 6189 else: 6190 # TODO crash and go on with basic annotations ? 
6191 # raise ValueError( 6192 # "Can't find splice databases in configuration EXIT" 6193 # ) 6194 log.warning( 6195 "Can't find splice databases in configuration, use annotations file from image" 6196 ) 6197 except TypeError: 6198 log.warning( 6199 "Can't find splice databases in configuration, use annotations file from image" 6200 ) 6201 return [] 6202 6203 # Add options, check if transcriptome option have already beend provided 6204 if ( 6205 "spip_transcriptome" not in nf_params 6206 and "spliceai_transcriptome" not in nf_params 6207 ): 6208 splice_reference = splice_annotations(options, config) 6209 if splice_reference: 6210 nf_params.extend(splice_reference) 6211 6212 nf_params.append(f"--output_folder {output_folder}") 6213 6214 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6215 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6216 log.debug(cmd) 6217 6218 splice_config["docker"]["command"] = cmd 6219 6220 docker_cmd = get_bin_command( 6221 tool="splice", 6222 bin_type="docker", 6223 config=config, 6224 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6225 add_options=f"--name {random_uuid} {' '.join(mount)}", 6226 ) 6227 6228 # Docker debug 6229 # if splice_config.get("rm_container"): 6230 # rm_container = "--rm" 6231 # else: 6232 # rm_container = "" 6233 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6234 6235 log.debug(docker_cmd) 6236 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6237 log.debug(res.stdout) 6238 if res.stderr: 6239 log.error(res.stderr) 6240 res.check_returncode() 6241 else: 6242 log.warning(f"Splice tool configuration not found: {config}") 6243 
6244 # Update variants 6245 log.info("Annotation - Updating...") 6246 # Test find output vcf 6247 log.debug( 6248 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6249 ) 6250 output_vcf = [] 6251 # Wrong folder to look in 6252 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6253 if ( 6254 files 6255 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6256 ): 6257 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6258 # log.debug(os.listdir(options.get("output_folder"))) 6259 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6260 if not output_vcf: 6261 log.debug( 6262 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6263 ) 6264 else: 6265 # Get new header from annotated vcf 6266 log.debug(f"Initial header: {len(header.infos)} fields") 6267 # Create new header with splice infos 6268 new_vcf = Variants(input=output_vcf[0]) 6269 new_vcf_header = new_vcf.get_header().infos 6270 for keys, infos in new_vcf_header.items(): 6271 if keys not in header.infos.keys(): 6272 header.infos[keys] = infos 6273 log.debug(f"New header: {len(header.infos)} fields") 6274 log.debug(f"Splice tmp output: {output_vcf[0]}") 6275 self.update_from_vcf(output_vcf[0]) 6276 6277 # Remove folder 6278 remove_if_exists(output_folder) 6279 6280 ### 6281 # Prioritization 6282 ### 6283 6284 def get_config_default(self, name: str) -> dict: 6285 """ 6286 The function `get_config_default` returns a dictionary containing default configurations for 6287 various calculations and prioritizations. 6288 6289 :param name: The `get_config_default` function returns a dictionary containing default 6290 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6291 specify which specific configuration to retrieve from the dictionary 6292 :type name: str 6293 :return: The function `get_config_default` returns a dictionary containing default configuration 6294 settings for different calculations and prioritizations. The specific configuration settings are 6295 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6296 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6297 returned. If there is no match, an empty dictionary is returned. 6298 """ 6299 6300 config_default = { 6301 "calculations": { 6302 "variant_chr_pos_alt_ref": { 6303 "type": "sql", 6304 "name": "variant_chr_pos_alt_ref", 6305 "description": "Create a variant ID with chromosome, position, alt and ref", 6306 "available": False, 6307 "output_column_name": "variant_chr_pos_alt_ref", 6308 "output_column_type": "String", 6309 "output_column_description": "variant ID with chromosome, position, alt and ref", 6310 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6311 "operation_info": True, 6312 }, 6313 "VARTYPE": { 6314 "type": "sql", 6315 "name": "VARTYPE", 6316 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6317 "available": True, 6318 "output_column_name": "VARTYPE", 6319 "output_column_type": "String", 6320 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6321 "operation_query": """ 6322 CASE 6323 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6324 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6325 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6326 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6327 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6328 ELSE 'UNDEFINED' 6329 END 6330 """, 6331 "info_fields": ["SVTYPE"], 6332 "operation_info": True, 6333 }, 6334 "snpeff_hgvs": { 6335 "type": "python", 6336 "name": "snpeff_hgvs", 6337 "description": "HGVS nomenclatures from snpEff annotation", 6338 "available": True, 6339 "function_name": "calculation_extract_snpeff_hgvs", 6340 "function_params": ["snpeff_hgvs", "ANN"], 6341 }, 6342 "snpeff_ann_explode": { 6343 "type": "python", 6344 "name": "snpeff_ann_explode", 6345 "description": "Explode snpEff annotations with uniquify values", 6346 "available": True, 6347 "function_name": "calculation_snpeff_ann_explode", 6348 "function_params": [False, "fields", "snpeff_", "ANN"], 6349 }, 6350 "snpeff_ann_explode_uniquify": { 6351 "type": "python", 6352 "name": "snpeff_ann_explode_uniquify", 6353 "description": "Explode snpEff annotations", 6354 "available": True, 6355 "function_name": "calculation_snpeff_ann_explode", 6356 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6357 }, 6358 "snpeff_ann_explode_json": { 6359 "type": "python", 6360 "name": "snpeff_ann_explode_json", 6361 "description": "Explode snpEff annotations in JSON format", 6362 "available": True, 6363 "function_name": "calculation_snpeff_ann_explode", 6364 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6365 }, 6366 "NOMEN": { 6367 "type": "python", 6368 "name": "NOMEN", 6369 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6370 "available": True, 6371 "function_name": "calculation_extract_nomen", 6372 "function_params": [], 6373 }, 6374 "FINDBYPIPELINE": { 6375 "type": "python", 6376 "name": "FINDBYPIPELINE", 6377 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6378 "available": True, 6379 "function_name": "calculation_find_by_pipeline", 6380 "function_params": ["findbypipeline"], 6381 }, 6382 "FINDBYSAMPLE": { 6383 "type": "python", 6384 "name": "FINDBYSAMPLE", 6385 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6386 "available": True, 6387 "function_name": "calculation_find_by_pipeline", 6388 "function_params": ["findbysample"], 6389 }, 6390 "GENOTYPECONCORDANCE": { 6391 "type": "python", 6392 "name": "GENOTYPECONCORDANCE", 6393 "description": "Concordance of genotype for multi caller VCF", 6394 "available": True, 6395 "function_name": "calculation_genotype_concordance", 6396 "function_params": [], 6397 }, 6398 "BARCODE": { 6399 "type": "python", 6400 "name": "BARCODE", 6401 "description": "BARCODE as VaRank tool", 6402 "available": True, 6403 "function_name": "calculation_barcode", 6404 "function_params": [], 6405 }, 6406 "BARCODEFAMILY": { 6407 "type": "python", 6408 "name": "BARCODEFAMILY", 6409 "description": "BARCODEFAMILY as VaRank tool", 6410 "available": True, 6411 "function_name": "calculation_barcode_family", 6412 "function_params": ["BCF"], 6413 }, 6414 "TRIO": { 6415 "type": "python", 6416 "name": "TRIO", 6417 "description": "Inheritance for a trio family", 6418 "available": True, 6419 "function_name": "calculation_trio", 6420 "function_params": [], 6421 }, 6422 "VAF": { 6423 "type": "python", 6424 "name": "VAF", 6425 "description": "Variant Allele Frequency (VAF) harmonization", 6426 "available": True, 6427 "function_name": "calculation_vaf_normalization", 6428 "function_params": [], 6429 }, 6430 "VAF_stats": { 6431 "type": "python", 6432 "name": 
"VAF_stats", 6433 "description": "Variant Allele Frequency (VAF) statistics", 6434 "available": True, 6435 "function_name": "calculation_genotype_stats", 6436 "function_params": ["VAF"], 6437 }, 6438 "DP_stats": { 6439 "type": "python", 6440 "name": "DP_stats", 6441 "description": "Depth (DP) statistics", 6442 "available": True, 6443 "function_name": "calculation_genotype_stats", 6444 "function_params": ["DP"], 6445 }, 6446 "variant_id": { 6447 "type": "python", 6448 "name": "variant_id", 6449 "description": "Variant ID generated from variant position and type", 6450 "available": True, 6451 "function_name": "calculation_variant_id", 6452 "function_params": [], 6453 }, 6454 "transcripts_json": { 6455 "type": "python", 6456 "name": "transcripts_json", 6457 "description": "Add transcripts info in JSON format (field 'transcripts_json')", 6458 "available": True, 6459 "function_name": "calculation_transcripts_json", 6460 "function_params": ["transcripts_json"], 6461 }, 6462 "transcripts_prioritization": { 6463 "type": "python", 6464 "name": "transcripts_prioritization", 6465 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6466 "available": True, 6467 "function_name": "calculation_transcripts_prioritization", 6468 "function_params": [], 6469 }, 6470 }, 6471 "prioritizations": { 6472 "default": { 6473 "filter": [ 6474 { 6475 "type": "notequals", 6476 "value": "!PASS|\\.", 6477 "score": 0, 6478 "flag": "FILTERED", 6479 "comment": ["Bad variant quality"], 6480 }, 6481 { 6482 "type": "equals", 6483 "value": "REJECT", 6484 "score": -20, 6485 "flag": "PASS", 6486 "comment": ["Bad variant quality"], 6487 }, 6488 ], 6489 "DP": [ 6490 { 6491 "type": "gte", 6492 "value": "50", 6493 "score": 5, 6494 "flag": "PASS", 6495 "comment": ["DP higher than 50"], 6496 } 6497 ], 6498 "ANN": [ 6499 { 6500 "type": "contains", 6501 "value": "HIGH", 6502 "score": 5, 6503 "flag": "PASS", 6504 "comment": [ 6505 "The variant is assumed to have high 
(disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6506 ], 6507 }, 6508 { 6509 "type": "contains", 6510 "value": "MODERATE", 6511 "score": 3, 6512 "flag": "PASS", 6513 "comment": [ 6514 "A non-disruptive variant that might change protein effectiveness" 6515 ], 6516 }, 6517 { 6518 "type": "contains", 6519 "value": "LOW", 6520 "score": 0, 6521 "flag": "FILTERED", 6522 "comment": [ 6523 "Assumed to be mostly harmless or unlikely to change protein behavior" 6524 ], 6525 }, 6526 { 6527 "type": "contains", 6528 "value": "MODIFIER", 6529 "score": 0, 6530 "flag": "FILTERED", 6531 "comment": [ 6532 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6533 ], 6534 }, 6535 ], 6536 } 6537 }, 6538 } 6539 6540 return config_default.get(name, None) 6541 6542 def get_config_json( 6543 self, name: str, config_dict: dict = {}, config_file: str = None 6544 ) -> dict: 6545 """ 6546 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6547 default values, a dictionary, and a file. 6548 6549 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6550 the name of the configuration. It is used to identify and retrieve the configuration settings 6551 for a specific component or module 6552 :type name: str 6553 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6554 dictionary that allows you to provide additional configuration settings or overrides. 
When you 6555 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6556 the key is the configuration setting you want to override or 6557 :type config_dict: dict 6558 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6559 specify the path to a configuration file that contains additional settings. If provided, the 6560 function will read the contents of this file and update the configuration dictionary with the 6561 values found in the file, overriding any existing values with the 6562 :type config_file: str 6563 :return: The function `get_config_json` returns a dictionary containing the configuration 6564 settings. 6565 """ 6566 6567 # Create with default prioritizations 6568 config_default = self.get_config_default(name=name) 6569 configuration = config_default 6570 # log.debug(f"configuration={configuration}") 6571 6572 # Replace prioritizations from dict 6573 for config in config_dict: 6574 configuration[config] = config_dict[config] 6575 6576 # Replace prioritizations from file 6577 config_file = full_path(config_file) 6578 if config_file: 6579 if os.path.exists(config_file): 6580 with open(config_file) as config_file_content: 6581 config_file_dict = json.load(config_file_content) 6582 for config in config_file_dict: 6583 configuration[config] = config_file_dict[config] 6584 else: 6585 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6586 log.error(msg_error) 6587 raise ValueError(msg_error) 6588 6589 return configuration 6590 6591 def prioritization( 6592 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6593 ) -> bool: 6594 """ 6595 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6596 prioritizes variants based on configured profiles and criteria. 
6597 6598 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6599 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6600 a table name is provided, the method will prioritize the variants in that specific table 6601 :type table: str 6602 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6603 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6604 provided, the code will use a default prefix value of "PZ" 6605 :type pz_prefix: str 6606 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6607 additional parameters specific to the prioritization process. These parameters can include 6608 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6609 configurations needed for the prioritization of variants in a V 6610 :type pz_param: dict 6611 :return: A boolean value (True) is being returned from the `prioritization` function. 
6612 """ 6613 6614 # Config 6615 config = self.get_config() 6616 6617 # Param 6618 param = self.get_param() 6619 6620 # Prioritization param 6621 if pz_param is not None: 6622 prioritization_param = pz_param 6623 else: 6624 prioritization_param = param.get("prioritization", {}) 6625 6626 # Configuration profiles 6627 prioritization_config_file = prioritization_param.get( 6628 "prioritization_config", None 6629 ) 6630 prioritization_config_file = full_path(prioritization_config_file) 6631 prioritizations_config = self.get_config_json( 6632 name="prioritizations", config_file=prioritization_config_file 6633 ) 6634 6635 # Prioritization prefix 6636 pz_prefix_default = "PZ" 6637 if pz_prefix is None: 6638 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6639 6640 # Prioritization options 6641 profiles = prioritization_param.get("profiles", []) 6642 if isinstance(profiles, str): 6643 profiles = profiles.split(",") 6644 pzfields = prioritization_param.get( 6645 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6646 ) 6647 if isinstance(pzfields, str): 6648 pzfields = pzfields.split(",") 6649 default_profile = prioritization_param.get("default_profile", None) 6650 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6651 prioritization_score_mode = prioritization_param.get( 6652 "prioritization_score_mode", "HOWARD" 6653 ) 6654 6655 # Quick Prioritizations 6656 prioritizations = param.get("prioritizations", None) 6657 if prioritizations: 6658 log.info("Quick Prioritization:") 6659 for profile in prioritizations.split(","): 6660 if profile not in profiles: 6661 profiles.append(profile) 6662 log.info(f" {profile}") 6663 6664 # If profile "ALL" provided, all profiles in the config profiles 6665 if "ALL" in profiles: 6666 profiles = list(prioritizations_config.keys()) 6667 6668 for profile in profiles: 6669 if prioritizations_config.get(profile, None): 6670 log.debug(f"Profile '{profile}' configured") 6671 else: 6672 msg_error = f"Profile 
'{profile}' NOT configured" 6673 log.error(msg_error) 6674 raise ValueError(msg_error) 6675 6676 if profiles: 6677 log.info(f"Prioritization... ") 6678 else: 6679 log.debug(f"No profile defined") 6680 return False 6681 6682 if not default_profile and len(profiles): 6683 default_profile = profiles[0] 6684 6685 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6686 log.debug("Profiles to check: " + str(list(profiles))) 6687 6688 # Variables 6689 if table is not None: 6690 table_variants = table 6691 else: 6692 table_variants = self.get_table_variants(clause="update") 6693 log.debug(f"Table to prioritize: {table_variants}") 6694 6695 # Added columns 6696 added_columns = [] 6697 6698 # Create list of PZfields 6699 # List of PZFields 6700 list_of_pzfields_original = pzfields + [ 6701 pzfield + pzfields_sep + profile 6702 for pzfield in pzfields 6703 for profile in profiles 6704 ] 6705 list_of_pzfields = [] 6706 log.debug(f"{list_of_pzfields_original}") 6707 6708 # Remove existing PZfields to use if exists 6709 for pzfield in list_of_pzfields_original: 6710 if self.get_header().infos.get(pzfield, None) is None: 6711 list_of_pzfields.append(pzfield) 6712 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6713 else: 6714 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6715 6716 if list_of_pzfields: 6717 6718 # Explode Infos prefix 6719 explode_infos_prefix = self.get_explode_infos_prefix() 6720 6721 # PZfields tags description 6722 PZfields_INFOS = { 6723 f"{pz_prefix}Tags": { 6724 "ID": f"{pz_prefix}Tags", 6725 "Number": ".", 6726 "Type": "String", 6727 "Description": "Variant tags based on annotation criteria", 6728 }, 6729 f"{pz_prefix}Score": { 6730 "ID": f"{pz_prefix}Score", 6731 "Number": 1, 6732 "Type": "Integer", 6733 "Description": "Variant score based on annotation criteria", 6734 }, 6735 f"{pz_prefix}Flag": { 6736 "ID": f"{pz_prefix}Flag", 6737 "Number": 1, 6738 "Type": "String", 6739 
"Description": "Variant flag based on annotation criteria", 6740 }, 6741 f"{pz_prefix}Comment": { 6742 "ID": f"{pz_prefix}Comment", 6743 "Number": ".", 6744 "Type": "String", 6745 "Description": "Variant comment based on annotation criteria", 6746 }, 6747 f"{pz_prefix}Infos": { 6748 "ID": f"{pz_prefix}Infos", 6749 "Number": ".", 6750 "Type": "String", 6751 "Description": "Variant infos based on annotation criteria", 6752 }, 6753 } 6754 6755 # Create INFO fields if not exist 6756 for field in PZfields_INFOS: 6757 field_ID = PZfields_INFOS[field]["ID"] 6758 field_description = PZfields_INFOS[field]["Description"] 6759 if field_ID not in self.get_header().infos and field_ID in pzfields: 6760 field_description = ( 6761 PZfields_INFOS[field]["Description"] 6762 + f", profile {default_profile}" 6763 ) 6764 self.get_header().infos[field_ID] = vcf.parser._Info( 6765 field_ID, 6766 PZfields_INFOS[field]["Number"], 6767 PZfields_INFOS[field]["Type"], 6768 field_description, 6769 "unknown", 6770 "unknown", 6771 code_type_map[PZfields_INFOS[field]["Type"]], 6772 ) 6773 6774 # Create INFO fields if not exist for each profile 6775 for profile in prioritizations_config: 6776 if profile in profiles or profiles == []: 6777 for field in PZfields_INFOS: 6778 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6779 field_description = ( 6780 PZfields_INFOS[field]["Description"] 6781 + f", profile {profile}" 6782 ) 6783 if ( 6784 field_ID not in self.get_header().infos 6785 and field in pzfields 6786 ): 6787 self.get_header().infos[field_ID] = vcf.parser._Info( 6788 field_ID, 6789 PZfields_INFOS[field]["Number"], 6790 PZfields_INFOS[field]["Type"], 6791 field_description, 6792 "unknown", 6793 "unknown", 6794 code_type_map[PZfields_INFOS[field]["Type"]], 6795 ) 6796 6797 # Header 6798 for pzfield in list_of_pzfields: 6799 if re.match(f"{pz_prefix}Score.*", pzfield): 6800 added_column = self.add_column( 6801 table_name=table_variants, 6802 column_name=pzfield, 6803 
column_type="INTEGER", 6804 default_value="0", 6805 ) 6806 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6807 added_column = self.add_column( 6808 table_name=table_variants, 6809 column_name=pzfield, 6810 column_type="BOOLEAN", 6811 default_value="1", 6812 ) 6813 else: 6814 added_column = self.add_column( 6815 table_name=table_variants, 6816 column_name=pzfield, 6817 column_type="STRING", 6818 default_value="''", 6819 ) 6820 added_columns.append(added_column) 6821 6822 # Profiles 6823 if profiles: 6824 6825 # foreach profile in configuration file 6826 for profile in prioritizations_config: 6827 6828 # If profile is asked in param, or ALL are asked (empty profile []) 6829 if profile in profiles or profiles == []: 6830 log.info(f"Profile '{profile}'") 6831 6832 sql_set_info_option = "" 6833 6834 sql_set_info = [] 6835 6836 # PZ fields set 6837 6838 # PZScore 6839 if ( 6840 f"{pz_prefix}Score{pzfields_sep}{profile}" 6841 in list_of_pzfields 6842 ): 6843 sql_set_info.append( 6844 f""" 6845 concat( 6846 '{pz_prefix}Score{pzfields_sep}{profile}=', 6847 {pz_prefix}Score{pzfields_sep}{profile} 6848 ) 6849 """ 6850 ) 6851 if ( 6852 profile == default_profile 6853 and f"{pz_prefix}Score" in list_of_pzfields 6854 ): 6855 sql_set_info.append( 6856 f""" 6857 concat( 6858 '{pz_prefix}Score=', 6859 {pz_prefix}Score{pzfields_sep}{profile} 6860 ) 6861 """ 6862 ) 6863 6864 # PZFlag 6865 if ( 6866 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6867 in list_of_pzfields 6868 ): 6869 sql_set_info.append( 6870 f""" 6871 concat( 6872 '{pz_prefix}Flag{pzfields_sep}{profile}=', 6873 CASE 6874 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 6875 THEN 'PASS' 6876 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 6877 THEN 'FILTERED' 6878 END 6879 ) 6880 """ 6881 ) 6882 if ( 6883 profile == default_profile 6884 and f"{pz_prefix}Flag" in list_of_pzfields 6885 ): 6886 sql_set_info.append( 6887 f""" 6888 concat( 6889 '{pz_prefix}Flag=', 6890 CASE 6891 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 6892 
THEN 'PASS' 6893 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 6894 THEN 'FILTERED' 6895 END 6896 ) 6897 """ 6898 ) 6899 6900 # PZComment 6901 if ( 6902 f"{pz_prefix}Comment{pzfields_sep}{profile}" 6903 in list_of_pzfields 6904 ): 6905 sql_set_info.append( 6906 f""" 6907 CASE 6908 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 6909 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 6910 ELSE '' 6911 END 6912 """ 6913 ) 6914 if ( 6915 profile == default_profile 6916 and f"{pz_prefix}Comment" in list_of_pzfields 6917 ): 6918 sql_set_info.append( 6919 f""" 6920 CASE 6921 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 6922 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 6923 ELSE '' 6924 END 6925 """ 6926 ) 6927 6928 # PZInfos 6929 if ( 6930 f"{pz_prefix}Infos{pzfields_sep}{profile}" 6931 in list_of_pzfields 6932 ): 6933 sql_set_info.append( 6934 f""" 6935 CASE 6936 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 6937 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 6938 ELSE '' 6939 END 6940 """ 6941 ) 6942 if ( 6943 profile == default_profile 6944 and f"{pz_prefix}Infos" in list_of_pzfields 6945 ): 6946 sql_set_info.append( 6947 f""" 6948 CASE 6949 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 6950 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 6951 ELSE '' 6952 END 6953 """ 6954 ) 6955 6956 # Merge PZfields 6957 sql_set_info_option = "" 6958 sql_set_sep = "" 6959 for sql_set in sql_set_info: 6960 if sql_set_sep: 6961 sql_set_info_option += f""" 6962 , concat('{sql_set_sep}', {sql_set}) 6963 """ 6964 else: 6965 sql_set_info_option += f""" 6966 , {sql_set} 6967 """ 6968 sql_set_sep = ";" 6969 6970 sql_queries = [] 6971 for annotation in prioritizations_config[profile]: 6972 6973 # Explode specific annotation 6974 log.debug(f"Explode annotation '{annotation}'") 6975 
added_columns += self.explode_infos( 6976 prefix=explode_infos_prefix, 6977 fields=[annotation], 6978 table=table_variants, 6979 ) 6980 extra_infos = self.get_extra_infos(table=table_variants) 6981 6982 # Check if annotation field is present 6983 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 6984 log.debug(f"Annotation '{annotation}' not in data") 6985 continue 6986 else: 6987 log.debug(f"Annotation '{annotation}' in data") 6988 6989 # For each criterions 6990 for criterion in prioritizations_config[profile][ 6991 annotation 6992 ]: 6993 criterion_type = criterion["type"] 6994 criterion_value = criterion["value"] 6995 criterion_score = criterion.get("score", 0) 6996 criterion_flag = criterion.get("flag", "PASS") 6997 criterion_flag_bool = criterion_flag == "PASS" 6998 criterion_comment = ( 6999 ", ".join(criterion.get("comment", [])) 7000 .replace("'", "''") 7001 .replace(";", ",") 7002 .replace("\t", " ") 7003 ) 7004 criterion_infos = ( 7005 str(criterion) 7006 .replace("'", "''") 7007 .replace(";", ",") 7008 .replace("\t", " ") 7009 ) 7010 7011 sql_set = [] 7012 sql_set_info = [] 7013 7014 # PZ fields set 7015 if ( 7016 f"{pz_prefix}Score{pzfields_sep}{profile}" 7017 in list_of_pzfields 7018 ): 7019 if prioritization_score_mode == "HOWARD": 7020 sql_set.append( 7021 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7022 ) 7023 elif prioritization_score_mode == "VaRank": 7024 sql_set.append( 7025 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7026 ) 7027 else: 7028 sql_set.append( 7029 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7030 ) 7031 if ( 7032 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7033 in list_of_pzfields 7034 ): 7035 sql_set.append( 7036 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND 
{criterion_flag_bool}" 7037 ) 7038 if ( 7039 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7040 in list_of_pzfields 7041 ): 7042 sql_set.append( 7043 f""" 7044 {pz_prefix}Comment{pzfields_sep}{profile} = 7045 concat( 7046 {pz_prefix}Comment{pzfields_sep}{profile}, 7047 CASE 7048 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7049 THEN ', ' 7050 ELSE '' 7051 END, 7052 '{criterion_comment}' 7053 ) 7054 """ 7055 ) 7056 if ( 7057 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7058 in list_of_pzfields 7059 ): 7060 sql_set.append( 7061 f""" 7062 {pz_prefix}Infos{pzfields_sep}{profile} = 7063 concat( 7064 {pz_prefix}Infos{pzfields_sep}{profile}, 7065 '{criterion_infos}' 7066 ) 7067 """ 7068 ) 7069 sql_set_option = ",".join(sql_set) 7070 7071 # Criterion and comparison 7072 if sql_set_option: 7073 try: 7074 float(criterion_value) 7075 sql_update = f""" 7076 UPDATE {table_variants} 7077 SET {sql_set_option} 7078 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7079 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7080 """ 7081 except: 7082 contains_option = "" 7083 if criterion_type == "contains": 7084 contains_option = ".*" 7085 sql_update = f""" 7086 UPDATE {table_variants} 7087 SET {sql_set_option} 7088 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7089 """ 7090 sql_queries.append(sql_update) 7091 else: 7092 log.warning( 7093 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7094 ) 7095 7096 # PZTags 7097 if ( 7098 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7099 in list_of_pzfields 7100 ): 7101 7102 # Create PZFalgs value 7103 pztags_value = "" 7104 pztags_sep_default = "|" 7105 pztags_sep = "" 7106 for pzfield in pzfields: 7107 if pzfield not in [f"{pz_prefix}Tags"]: 7108 if ( 7109 f"{pzfield}{pzfields_sep}{profile}" 7110 in list_of_pzfields 7111 ): 7112 if pzfield in [f"{pz_prefix}Flag"]: 7113 pztags_value += 
f"""{pztags_sep}{pzfield}#', 7114 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7115 THEN 'PASS' 7116 ELSE 'FILTERED' 7117 END, '""" 7118 else: 7119 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7120 pztags_sep = pztags_sep_default 7121 7122 # Add Query update for PZFlags 7123 sql_update_pztags = f""" 7124 UPDATE {table_variants} 7125 SET INFO = concat( 7126 INFO, 7127 CASE WHEN INFO NOT in ('','.') 7128 THEN ';' 7129 ELSE '' 7130 END, 7131 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7132 ) 7133 """ 7134 sql_queries.append(sql_update_pztags) 7135 7136 # Add Query update for PZFlags for default 7137 if profile == default_profile: 7138 sql_update_pztags_default = f""" 7139 UPDATE {table_variants} 7140 SET INFO = concat( 7141 INFO, 7142 ';', 7143 '{pz_prefix}Tags={pztags_value}' 7144 ) 7145 """ 7146 sql_queries.append(sql_update_pztags_default) 7147 7148 log.info(f"""Profile '{profile}' - Prioritization... """) 7149 7150 if sql_queries: 7151 7152 for sql_query in sql_queries: 7153 log.debug( 7154 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7155 ) 7156 self.conn.execute(sql_query) 7157 7158 log.info(f"""Profile '{profile}' - Update... 
""") 7159 sql_query_update = f""" 7160 UPDATE {table_variants} 7161 SET INFO = 7162 concat( 7163 CASE 7164 WHEN INFO NOT IN ('','.') 7165 THEN concat(INFO, ';') 7166 ELSE '' 7167 END 7168 {sql_set_info_option} 7169 ) 7170 """ 7171 self.conn.execute(sql_query_update) 7172 7173 else: 7174 7175 log.warning(f"No profiles in parameters") 7176 7177 # Remove added columns 7178 for added_column in added_columns: 7179 self.drop_column(column=added_column) 7180 7181 # Explode INFOS fields into table fields 7182 if self.get_explode_infos(): 7183 self.explode_infos( 7184 prefix=self.get_explode_infos_prefix(), 7185 fields=self.get_explode_infos_fields(), 7186 force=True, 7187 ) 7188 7189 return True 7190 7191 ### 7192 # HGVS 7193 ### 7194 7195 def annotation_hgvs(self, threads: int = None) -> None: 7196 """ 7197 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7198 coordinates and alleles. 7199 7200 :param threads: The `threads` parameter is an optional integer that specifies the number of 7201 threads to use for parallel processing. If no value is provided, it will default to the number 7202 of threads obtained from the `get_threads()` method 7203 :type threads: int 7204 """ 7205 7206 # Function for each partition of the Dask Dataframe 7207 def partition_function(partition): 7208 """ 7209 The function `partition_function` applies the `annotation_hgvs_partition` function to 7210 each row of a DataFrame called `partition`. 7211 7212 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7213 to be processed 7214 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7215 the "partition" dataframe along the axis 1. 
7216 """ 7217 return partition.apply(annotation_hgvs_partition, axis=1) 7218 7219 def annotation_hgvs_partition(row) -> str: 7220 """ 7221 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7222 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7223 7224 :param row: A dictionary-like object that contains the values for the following keys: 7225 :return: a string that contains the HGVS names associated with the given row of data. 7226 """ 7227 7228 chr = row["CHROM"] 7229 pos = row["POS"] 7230 ref = row["REF"] 7231 alt = row["ALT"] 7232 7233 # Find list of associated transcripts 7234 transcripts_list = list( 7235 polars_conn.execute( 7236 f""" 7237 SELECT transcript 7238 FROM refseq_df 7239 WHERE CHROM='{chr}' 7240 AND POS={pos} 7241 """ 7242 )["transcript"] 7243 ) 7244 7245 # Full HGVS annotation in list 7246 hgvs_full_list = [] 7247 7248 for transcript_name in transcripts_list: 7249 7250 # Transcript 7251 transcript = get_transcript( 7252 transcripts=transcripts, transcript_name=transcript_name 7253 ) 7254 # Exon 7255 if use_exon: 7256 exon = transcript.find_exon_number(pos) 7257 else: 7258 exon = None 7259 # Protein 7260 transcript_protein = None 7261 if use_protein or add_protein or full_format: 7262 transcripts_protein = list( 7263 polars_conn.execute( 7264 f""" 7265 SELECT protein 7266 FROM refseqlink_df 7267 WHERE transcript='{transcript_name}' 7268 LIMIT 1 7269 """ 7270 )["protein"] 7271 ) 7272 if len(transcripts_protein): 7273 transcript_protein = transcripts_protein[0] 7274 7275 # HGVS name 7276 hgvs_name = format_hgvs_name( 7277 chr, 7278 pos, 7279 ref, 7280 alt, 7281 genome=genome, 7282 transcript=transcript, 7283 transcript_protein=transcript_protein, 7284 exon=exon, 7285 use_gene=use_gene, 7286 use_protein=use_protein, 7287 full_format=full_format, 7288 use_version=use_version, 7289 codon_type=codon_type, 7290 ) 7291 hgvs_full_list.append(hgvs_name) 7292 if add_protein and not 
use_protein and not full_format: 7293 hgvs_name = format_hgvs_name( 7294 chr, 7295 pos, 7296 ref, 7297 alt, 7298 genome=genome, 7299 transcript=transcript, 7300 transcript_protein=transcript_protein, 7301 exon=exon, 7302 use_gene=use_gene, 7303 use_protein=True, 7304 full_format=False, 7305 use_version=use_version, 7306 codon_type=codon_type, 7307 ) 7308 hgvs_full_list.append(hgvs_name) 7309 7310 # Create liste of HGVS annotations 7311 hgvs_full = ",".join(hgvs_full_list) 7312 7313 return hgvs_full 7314 7315 # Polars connexion 7316 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7317 7318 # Config 7319 config = self.get_config() 7320 7321 # Databases 7322 # Genome 7323 databases_genomes_folders = ( 7324 config.get("folders", {}) 7325 .get("databases", {}) 7326 .get("genomes", DEFAULT_GENOME_FOLDER) 7327 ) 7328 databases_genome = ( 7329 config.get("folders", {}).get("databases", {}).get("genomes", "") 7330 ) 7331 # refseq database folder 7332 databases_refseq_folders = ( 7333 config.get("folders", {}) 7334 .get("databases", {}) 7335 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7336 ) 7337 # refseq 7338 databases_refseq = config.get("databases", {}).get("refSeq", None) 7339 # refSeqLink 7340 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7341 7342 # Param 7343 param = self.get_param() 7344 7345 # Quick HGVS 7346 if "hgvs_options" in param and param.get("hgvs_options", ""): 7347 log.info(f"Quick HGVS Annotation:") 7348 if not param.get("hgvs", None): 7349 param["hgvs"] = {} 7350 for option in param.get("hgvs_options", "").split(","): 7351 option_var_val = option.split("=") 7352 option_var = option_var_val[0] 7353 if len(option_var_val) > 1: 7354 option_val = option_var_val[1] 7355 else: 7356 option_val = "True" 7357 if option_val.upper() in ["TRUE"]: 7358 option_val = True 7359 elif option_val.upper() in ["FALSE"]: 7360 option_val = False 7361 log.info(f" {option_var}={option_val}") 7362 param["hgvs"][option_var] = option_val 7363 
7364 # Check if HGVS annotation enabled 7365 if "hgvs" in param: 7366 log.info(f"HGVS Annotation... ") 7367 for hgvs_option in param.get("hgvs", {}): 7368 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7369 else: 7370 return 7371 7372 # HGVS Param 7373 param_hgvs = param.get("hgvs", {}) 7374 use_exon = param_hgvs.get("use_exon", False) 7375 use_gene = param_hgvs.get("use_gene", False) 7376 use_protein = param_hgvs.get("use_protein", False) 7377 add_protein = param_hgvs.get("add_protein", False) 7378 full_format = param_hgvs.get("full_format", False) 7379 use_version = param_hgvs.get("use_version", False) 7380 codon_type = param_hgvs.get("codon_type", "3") 7381 7382 # refSseq refSeqLink 7383 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7384 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7385 7386 # Assembly 7387 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7388 7389 # Genome 7390 genome_file = None 7391 if find_genome(databases_genome): 7392 genome_file = find_genome(databases_genome) 7393 else: 7394 genome_file = find_genome( 7395 genome_path=databases_genomes_folders, assembly=assembly 7396 ) 7397 log.debug("Genome: " + str(genome_file)) 7398 7399 # refSseq 7400 refseq_file = find_file_prefix( 7401 input_file=databases_refseq, 7402 prefix="ncbiRefSeq", 7403 folder=databases_refseq_folders, 7404 assembly=assembly, 7405 ) 7406 log.debug("refSeq: " + str(refseq_file)) 7407 7408 # refSeqLink 7409 refseqlink_file = find_file_prefix( 7410 input_file=databases_refseqlink, 7411 prefix="ncbiRefSeqLink", 7412 folder=databases_refseq_folders, 7413 assembly=assembly, 7414 ) 7415 log.debug("refSeqLink: " + str(refseqlink_file)) 7416 7417 # Threads 7418 if not threads: 7419 threads = self.get_threads() 7420 log.debug("Threads: " + str(threads)) 7421 7422 # Variables 7423 table_variants = self.get_table_variants(clause="update") 7424 7425 # Get variants SNV and InDel only 7426 
query_variants = f""" 7427 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7428 FROM {table_variants} 7429 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7430 """ 7431 df_variants = self.get_query_to_df(query_variants) 7432 7433 # Added columns 7434 added_columns = [] 7435 7436 # Add hgvs column in variants table 7437 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7438 added_column = self.add_column( 7439 table_variants, hgvs_column_name, "STRING", default_value=None 7440 ) 7441 added_columns.append(added_column) 7442 7443 log.debug(f"refSeq loading...") 7444 # refSeq in duckDB 7445 refseq_table = get_refseq_table( 7446 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7447 ) 7448 # Loading all refSeq in Dataframe 7449 refseq_query = f""" 7450 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7451 FROM {refseq_table} 7452 JOIN df_variants ON ( 7453 {refseq_table}.chrom = df_variants.CHROM 7454 AND {refseq_table}.txStart<=df_variants.POS 7455 AND {refseq_table}.txEnd>=df_variants.POS 7456 ) 7457 """ 7458 refseq_df = self.conn.query(refseq_query).pl() 7459 7460 if refseqlink_file: 7461 log.debug(f"refSeqLink loading...") 7462 # refSeqLink in duckDB 7463 refseqlink_table = get_refseq_table( 7464 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7465 ) 7466 # Loading all refSeqLink in Dataframe 7467 protacc_column = "protAcc_with_ver" 7468 mrnaacc_column = "mrnaAcc_with_ver" 7469 refseqlink_query = f""" 7470 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7471 FROM {refseqlink_table} 7472 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7473 WHERE protAcc_without_ver IS NOT NULL 7474 """ 7475 # Polars Dataframe 7476 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7477 7478 # Read RefSeq transcripts into a python dict/model. 
7479 log.debug(f"Transcripts loading...") 7480 with tempfile.TemporaryDirectory() as tmpdir: 7481 transcripts_query = f""" 7482 COPY ( 7483 SELECT {refseq_table}.* 7484 FROM {refseq_table} 7485 JOIN df_variants ON ( 7486 {refseq_table}.chrom=df_variants.CHROM 7487 AND {refseq_table}.txStart<=df_variants.POS 7488 AND {refseq_table}.txEnd>=df_variants.POS 7489 ) 7490 ) 7491 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7492 """ 7493 self.conn.query(transcripts_query) 7494 with open(f"{tmpdir}/transcript.tsv") as infile: 7495 transcripts = read_transcripts(infile) 7496 7497 # Polars connexion 7498 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7499 7500 log.debug("Genome loading...") 7501 # Read genome sequence using pyfaidx. 7502 genome = Fasta(genome_file) 7503 7504 log.debug("Start annotation HGVS...") 7505 7506 # Create 7507 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7508 ddf = dd.from_pandas(df_variants, npartitions=threads) 7509 7510 # Use dask.dataframe.apply() to apply function on each partition 7511 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7512 7513 # Convert Dask DataFrame to Pandas Dataframe 7514 df = ddf.compute() 7515 7516 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7517 with tempfile.TemporaryDirectory() as tmpdir: 7518 df_parquet = os.path.join(tmpdir, "df.parquet") 7519 df.to_parquet(df_parquet) 7520 7521 # Update hgvs column 7522 update_variant_query = f""" 7523 UPDATE {table_variants} 7524 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7525 FROM read_parquet('{df_parquet}') as df 7526 WHERE variants."#CHROM" = df.CHROM 7527 AND variants.POS = df.POS 7528 AND variants.REF = df.REF 7529 AND variants.ALT = df.ALT 7530 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7531 """ 7532 self.execute_query(update_variant_query) 7533 7534 # Update INFO column 7535 sql_query_update = f""" 7536 UPDATE {table_variants} 7537 SET INFO = 7538 concat( 7539 CASE 7540 WHEN INFO NOT IN ('','.') 7541 THEN concat(INFO, ';') 7542 ELSE '' 7543 END, 7544 'hgvs=', 7545 {hgvs_column_name} 7546 ) 7547 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7548 """ 7549 self.execute_query(sql_query_update) 7550 7551 # Add header 7552 HGVS_INFOS = { 7553 "hgvs": { 7554 "ID": "hgvs", 7555 "Number": ".", 7556 "Type": "String", 7557 "Description": f"HGVS annotatation with HOWARD", 7558 } 7559 } 7560 7561 for field in HGVS_INFOS: 7562 field_ID = HGVS_INFOS[field]["ID"] 7563 field_description = HGVS_INFOS[field]["Description"] 7564 self.get_header().infos[field_ID] = vcf.parser._Info( 7565 field_ID, 7566 HGVS_INFOS[field]["Number"], 7567 HGVS_INFOS[field]["Type"], 7568 field_description, 7569 "unknown", 7570 "unknown", 7571 code_type_map[HGVS_INFOS[field]["Type"]], 7572 ) 7573 7574 # Remove added columns 7575 for added_column in added_columns: 7576 self.drop_column(column=added_column) 7577 7578 ### 7579 # Calculation 7580 ### 7581 7582 def get_operations_help( 7583 self, operations_config_dict: dict = {}, operations_config_file: str = None 7584 ) -> list: 7585 7586 # Init 7587 operations_help = [] 7588 7589 # operations 7590 operations = self.get_config_json( 7591 name="calculations", 7592 
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        # Keep only operations marked "available"
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    # NOTE(review): mutable default arguments ({}) are shared across calls — safe only
    # if callers never rely on them being fresh; confirm before refactoring
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }

        :param operations: operations to perform, keyed by operation name
        :param operations_config_dict: inline calculations configuration
        :param operations_config_file: path to a calculations configuration file
        :raises ValueError: if an operation or its type is not in the configuration
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add: comma-separated list of operation names from param
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    # Mirror the quick operation into the param tree
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations: dispatch by configured type ("python" or "sql")
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        mathematical operation to be performed. It includes the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed. It is used for logging and error handling purposes,
        defaults to unknown
        :type operation_name: str (optional)
        """

        # table variants
        table_variants = self.get_table_variants(clause="alter")

        # Operation infos
        # NOTE(review): the operation_name parameter is immediately overridden by the
        # operation's own "name" key — confirm the parameter is intentionally unused
        operation_name = operation.get("name", "unknown")
        log.debug(f"process sql {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        # operation_query may be given as a list of SQL fragments to join
        operation_query = operation.get("operation_query", None)
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)

        if operation_query:

            # Info fields check: all required INFO fields must exist in the header
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added_columns
                added_columns = []

                # Create VCF header field
                vcf_reader = self.get_header()
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=True,
                )

                # Create column to receive the operation result
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                    """
                    self.conn.execute(sql_update)

                    # Add to INFO (only where the result is non-NULL and non-empty)
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" =
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                        """
                        self.conn.execute(sql_update_info)

                # NOTE(review): bare except swallows the original error; consider
                # chaining it (raise ... from err) when this block is next touched
                except:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )

    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        operation to be performed. It has the following keys: "name", "function_name"
        (a method of this class) and "function_params" (positional arguments).
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the operation being performed. It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        """

        operation_name = operation["name"]
        # NOTE(review): message says "sql" although this handles python-type operations
        log.debug(f"process sql {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        # Dynamic dispatch to a method of this class
        getattr(self, function_name)(*function_params)

    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
7881 """ 7882 7883 # variant_id annotation field 7884 variant_id_tag = self.get_variant_id_column() 7885 added_columns = [variant_id_tag] 7886 7887 # variant_id hgvs tags" 7888 vcf_infos_tags = { 7889 variant_id_tag: "howard variant ID annotation", 7890 } 7891 7892 # Variants table 7893 table_variants = self.get_table_variants() 7894 7895 # Header 7896 vcf_reader = self.get_header() 7897 7898 # Add variant_id to header 7899 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7900 variant_id_tag, 7901 ".", 7902 "String", 7903 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7904 "howard calculation", 7905 "0", 7906 self.code_type_map.get("String"), 7907 ) 7908 7909 # Update 7910 sql_update = f""" 7911 UPDATE {table_variants} 7912 SET "INFO" = 7913 concat( 7914 CASE 7915 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7916 THEN '' 7917 ELSE concat("INFO", ';') 7918 END, 7919 '{variant_id_tag}=', 7920 "{variant_id_tag}" 7921 ) 7922 """ 7923 self.conn.execute(sql_update) 7924 7925 # Remove added columns 7926 for added_column in added_columns: 7927 self.drop_column(column=added_column) 7928 7929 def calculation_extract_snpeff_hgvs( 7930 self, 7931 snpeff_hgvs: str = "snpeff_hgvs", 7932 snpeff_field: str = "ANN", 7933 ) -> None: 7934 """ 7935 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7936 annotation field in a VCF file and adds them as a new column in the variants table. 7937 7938 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7939 function is used to specify the name of the column that will store the HGVS nomenclatures 7940 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7941 snpeff_hgvs 7942 :type snpeff_hgvs: str (optional) 7943 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7944 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 7945 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 7946 to ANN 7947 :type snpeff_field: str (optional) 7948 """ 7949 7950 # Snpeff hgvs tags 7951 vcf_infos_tags = { 7952 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7953 } 7954 7955 # Prefix 7956 prefix = self.get_explode_infos_prefix() 7957 if prefix: 7958 prefix = "INFO/" 7959 7960 # snpEff fields 7961 speff_ann_infos = prefix + snpeff_field 7962 speff_hgvs_infos = prefix + snpeff_hgvs 7963 7964 # Variants table 7965 table_variants = self.get_table_variants() 7966 7967 # Header 7968 vcf_reader = self.get_header() 7969 7970 # Add columns 7971 added_columns = [] 7972 7973 # Explode HGVS field in column 7974 added_columns += self.explode_infos(fields=[snpeff_field]) 7975 7976 if snpeff_field in vcf_reader.infos: 7977 7978 log.debug(vcf_reader.infos[snpeff_field]) 7979 7980 # Extract ANN header 7981 ann_description = vcf_reader.infos[snpeff_field].desc 7982 pattern = r"'(.+?)'" 7983 match = re.search(pattern, ann_description) 7984 if match: 7985 ann_header_match = match.group(1).split(" | ") 7986 ann_header_desc = {} 7987 for i in range(len(ann_header_match)): 7988 ann_header_info = "".join( 7989 char for char in ann_header_match[i] if char.isalnum() 7990 ) 7991 ann_header_desc[ann_header_info] = ann_header_match[i] 7992 if not ann_header_desc: 7993 raise ValueError("Invalid header description format") 7994 else: 7995 raise ValueError("Invalid header description format") 7996 7997 # Create variant id 7998 variant_id_column = self.get_variant_id_column() 7999 added_columns += [variant_id_column] 8000 8001 # Create dataframe 8002 dataframe_snpeff_hgvs = self.get_query_to_df( 8003 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8004 ) 8005 8006 # Create main NOMEN column 8007 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8008 speff_ann_infos 8009 ].apply( 8010 lambda x: extract_snpeff_hgvs( 
8011 str(x), header=list(ann_header_desc.values()) 8012 ) 8013 ) 8014 8015 # Add snpeff_hgvs to header 8016 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8017 snpeff_hgvs, 8018 ".", 8019 "String", 8020 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8021 "howard calculation", 8022 "0", 8023 self.code_type_map.get("String"), 8024 ) 8025 8026 # Update 8027 sql_update = f""" 8028 UPDATE variants 8029 SET "INFO" = 8030 concat( 8031 CASE 8032 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8033 THEN '' 8034 ELSE concat("INFO", ';') 8035 END, 8036 CASE 8037 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8038 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8039 THEN concat( 8040 '{snpeff_hgvs}=', 8041 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8042 ) 8043 ELSE '' 8044 END 8045 ) 8046 FROM dataframe_snpeff_hgvs 8047 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8048 8049 """ 8050 self.conn.execute(sql_update) 8051 8052 # Delete dataframe 8053 del dataframe_snpeff_hgvs 8054 gc.collect() 8055 8056 else: 8057 8058 log.warning( 8059 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8060 ) 8061 8062 # Remove added columns 8063 for added_column in added_columns: 8064 self.drop_column(column=added_column) 8065 8066 def calculation_snpeff_ann_explode( 8067 self, 8068 uniquify: bool = True, 8069 output_format: str = "fields", 8070 output_prefix: str = "snpeff_", 8071 snpeff_field: str = "ANN", 8072 ) -> None: 8073 """ 8074 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8075 exploding the HGVS field and updating variant information accordingly. 8076 8077 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8078 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8079 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8080 defaults to True 8081 :type uniquify: bool (optional) 8082 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8083 function specifies the format in which the output annotations will be generated. It has a 8084 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8085 format, defaults to fields 8086 :type output_format: str (optional) 8087 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8088 method is used to specify the prefix that will be added to the output annotations generated 8089 during the calculation process. This prefix helps to differentiate the newly added annotations 8090 from existing ones in the output data. By default, the, defaults to ANN_ 8091 :type output_prefix: str (optional) 8092 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8093 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8094 field will be processed to explode the HGVS annotations and update the variant information 8095 accordingly, defaults to ANN 8096 :type snpeff_field: str (optional) 8097 """ 8098 8099 # SnpEff annotation field 8100 snpeff_hgvs = "snpeff_ann_explode" 8101 8102 # Snpeff hgvs tags 8103 vcf_infos_tags = { 8104 snpeff_hgvs: "Explode snpEff annotations", 8105 } 8106 8107 # Prefix 8108 prefix = self.get_explode_infos_prefix() 8109 if prefix: 8110 prefix = "INFO/" 8111 8112 # snpEff fields 8113 speff_ann_infos = prefix + snpeff_field 8114 speff_hgvs_infos = prefix + snpeff_hgvs 8115 8116 # Variants table 8117 table_variants = self.get_table_variants() 8118 8119 # Header 8120 vcf_reader = self.get_header() 8121 8122 # Add columns 8123 added_columns = [] 8124 8125 # Explode HGVS field in column 8126 added_columns += self.explode_infos(fields=[snpeff_field]) 8127 log.debug(f"snpeff_field={snpeff_field}") 8128 log.debug(f"added_columns={added_columns}") 8129 8130 if snpeff_field in vcf_reader.infos: 8131 8132 # Extract ANN header 8133 ann_description = vcf_reader.infos[snpeff_field].desc 8134 pattern = r"'(.+?)'" 8135 match = re.search(pattern, ann_description) 8136 if match: 8137 ann_header_match = match.group(1).split(" | ") 8138 ann_header = [] 8139 ann_header_desc = {} 8140 for i in range(len(ann_header_match)): 8141 ann_header_info = "".join( 8142 char for char in ann_header_match[i] if char.isalnum() 8143 ) 8144 ann_header.append(ann_header_info) 8145 ann_header_desc[ann_header_info] = ann_header_match[i] 8146 if not ann_header_desc: 8147 raise ValueError("Invalid header description format") 8148 else: 8149 raise ValueError("Invalid header description format") 8150 8151 # Create variant id 8152 variant_id_column = self.get_variant_id_column() 8153 added_columns += [variant_id_column] 8154 8155 # Create dataframe 8156 dataframe_snpeff_hgvs = self.get_query_to_df( 8157 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8158 ) 8159 
8160 # Create snpEff columns 8161 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8162 speff_ann_infos 8163 ].apply( 8164 lambda x: explode_snpeff_ann( 8165 str(x), 8166 uniquify=uniquify, 8167 output_format=output_format, 8168 prefix=output_prefix, 8169 header=list(ann_header_desc.values()), 8170 ) 8171 ) 8172 8173 # Header 8174 ann_annotations_prefix = "" 8175 if output_format.upper() in ["JSON"]: 8176 ann_annotations_prefix = f"{output_prefix}=" 8177 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8178 output_prefix, 8179 ".", 8180 "String", 8181 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8182 + " - JSON format", 8183 "howard calculation", 8184 "0", 8185 self.code_type_map.get("String"), 8186 ) 8187 else: 8188 for ann_annotation in ann_header: 8189 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8190 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8191 ann_annotation_id, 8192 ".", 8193 "String", 8194 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8195 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8196 "howard calculation", 8197 "0", 8198 self.code_type_map.get("String"), 8199 ) 8200 8201 # Update 8202 sql_update = f""" 8203 UPDATE variants 8204 SET "INFO" = 8205 concat( 8206 CASE 8207 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8208 THEN '' 8209 ELSE concat("INFO", ';') 8210 END, 8211 CASE 8212 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8213 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8214 THEN concat( 8215 '{ann_annotations_prefix}', 8216 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8217 ) 8218 ELSE '' 8219 END 8220 ) 8221 FROM dataframe_snpeff_hgvs 8222 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8223 8224 """ 8225 self.conn.execute(sql_update) 8226 8227 # Delete dataframe 8228 del dataframe_snpeff_hgvs 8229 gc.collect() 8230 8231 else: 8232 8233 log.warning( 8234 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The configured HGVS INFO field is exploded into a column, `find_nomen` computes the
        NOMEN structure per variant, one INFO header entry is declared per NOMEN component,
        and the values are appended to the INFO column of the variants table.
        """

        # NOMEN field
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field (name of the INFO field holding the HGVS annotation; default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (optional file of preferred transcripts; first column is used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # NOTE(review): each non-empty field contributes a leading ';'
                # separator; if INFO was empty the updated INFO starts with ';' —
                # confirm this is intended (sibling methods strip ''/'.' first)
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE: the query references the local pandas DataFrame 'dataframe_hgvs'
            # by its variable name (presumably resolved by duckdb from the Python
            # frame) — do not rename the variable without updating the SQL
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory used by the per-variant HGVS frame)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file.
It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples (nothing to do on a VCF without genotypes)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (per-row helper applied on genotypes)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # NOTE: 'dataframe_findbypipeline' is referenced by name in the SQL —
            # presumably resolved by duckdb from the local frame; do not rename
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # if FORMAT and samples (nothing to do on a VCF without genotypes)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT 
{samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (per-row helper applied on genotypes)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations" looks
            # copy-pasted; it is dead in practice since the tag is in vcf_infos_tags
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # NOTE: 'dataframe_genotypeconcordance' is referenced by name in the SQL —
            # presumably resolved by duckdb from the local frame; do not rename
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # if FORMAT and samples (nothing to do on a VCF without genotypes)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty tag)
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (per-row helper applied on genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): the default 'vcf_infos_tags.get(tag)' is redundant —
            # it re-reads the same key, so the fallback can never differ
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # NOTE: 'dataframe_barcode' is referenced by name in the SQL —
            # presumably resolved by duckdb from the local frame; do not rename
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                            END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family members come from the 'BARCODEFAMILY.family_pedigree' calculation option
        (a JSON file path, a JSON string, a comma-separated sample list, or a dict); when it
        is absent all header samples are used. Results are written as FORMAT fields on each
        sample column ('{tag}' and '{tag}S').

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # if FORMAT and samples (nothing to do on a VCF without genotypes)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                # NOTE(review): 'ped' is rebound to the file handle then to the
                # parsed dict — shadowing is deliberate but easy to misread
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as a comma-separated sample list
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (per-row helper applied on the family samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family to header
            # Add vaf_normalization to header
            vcf_reader.formats[tag] = 
vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update
            # for sample in ped_samples:
            # Build one SET clause per genotype column: family samples get the
            # computed barcode, FORMAT gets the new field names, other samples '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                # For './.' genotypes, pad missing sub-fields ('.') to match the
                # FORMAT layout before appending the two new values
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.

        The trio comes from the 'TRIO.trio_pedigree' calculation option (a JSON file path,
        a JSON string, a comma-separated 'father,mother,child' list, or a dict); when it is
        absent the first three header samples are used.
        """

        # if FORMAT and samples (nothing to do on a VCF without genotypes)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect 'father,mother,child'
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (per-row helper applied on the trio samples)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations" looks
            # copy-pasted; it is dead in practice since the tag is in vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE 
concat("INFO", ';') 9036 END, 9037 CASE 9038 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9039 AND dataframe_trio."{trio_infos}" NOT NULL 9040 THEN concat( 9041 '{trio_tag}=', 9042 dataframe_trio."{trio_infos}" 9043 ) 9044 ELSE '' 9045 END 9046 ) 9047 FROM dataframe_trio 9048 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9049 """ 9050 self.conn.execute(sql_update) 9051 9052 # Remove added columns 9053 for added_column in added_columns: 9054 self.drop_column(column=added_column) 9055 9056 # Delete dataframe 9057 del dataframe_trio 9058 gc.collect() 9059 9060 def calculation_vaf_normalization(self) -> None: 9061 """ 9062 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9063 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9064 :return: The function does not return anything. 9065 """ 9066 9067 # if FORMAT and samples 9068 if ( 9069 "FORMAT" in self.get_header_columns_as_list() 9070 and self.get_header_sample_list() 9071 ): 9072 9073 # vaf_normalization annotation field 9074 vaf_normalization_tag = "VAF" 9075 9076 # VCF infos tags 9077 vcf_infos_tags = { 9078 "VAF": "VAF Variant Frequency", 9079 } 9080 9081 # Prefix 9082 prefix = self.get_explode_infos_prefix() 9083 9084 # Variants table 9085 table_variants = self.get_table_variants() 9086 9087 # Header 9088 vcf_reader = self.get_header() 9089 9090 # Do not calculate if VAF already exists 9091 if "VAF" in vcf_reader.formats: 9092 log.debug("VAF already on genotypes") 9093 return 9094 9095 # Create variant id 9096 variant_id_column = self.get_variant_id_column() 9097 added_columns = [variant_id_column] 9098 9099 # variant_id, FORMAT and samples 9100 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9101 f""" "{sample}" """ for sample in self.get_header_sample_list() 9102 ) 9103 9104 # Create dataframe 9105 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9106 log.debug(f"query={query}") 9107 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9108 9109 vaf_normalization_set = [] 9110 9111 # for each sample vaf_normalization 9112 for sample in self.get_header_sample_list(): 9113 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9114 lambda row: vaf_normalization(row, sample=sample), axis=1 9115 ) 9116 vaf_normalization_set.append( 9117 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9118 ) 9119 9120 # Add VAF to FORMAT 9121 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9122 "FORMAT" 9123 ].apply(lambda x: str(x) + ":VAF") 9124 vaf_normalization_set.append( 9125 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9126 ) 9127 9128 # Add vaf_normalization to header 9129 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9130 id=vaf_normalization_tag, 9131 num="1", 9132 type="Float", 9133 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9134 type_code=self.code_type_map.get("Float"), 9135 ) 9136 9137 # Create fields to add in INFO 9138 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9139 9140 # Update 9141 sql_update = f""" 9142 UPDATE {table_variants} 9143 SET {sql_vaf_normalization_set} 9144 FROM dataframe_vaf_normalization 9145 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9146 9147 """ 9148 self.conn.execute(sql_update) 9149 9150 # Remove added columns 9151 for added_column in added_columns: 9152 self.drop_column(column=added_column) 9153 9154 # Delete dataframe 9155 del dataframe_vaf_normalization 9156 gc.collect() 9157 9158 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9159 """ 9160 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9161 field in a VCF file and updates the INFO column of the variants table with the calculated 9162 statistics. 

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples (nothing to do on a VCF without genotypes)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (per-row dict of statistics for the samples)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add each stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator only between stat fields (INFO's own ';' is
                # already appended by the outer concat below)
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            # NOTE: 'dataframe_vaf_stats' is referenced by name in the SQL —
            # presumably resolved by duckdb from the local frame; do not rename
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
        """
        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
        to it if transcripts are available.

        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
        parameter that specifies the information field to be used in the transcripts JSON. It has a
        default value of "transcripts_json" if no value is provided when calling the method, defaults to
        transcripts_json
        :type info: str (optional)
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table, transcripts_info_field=info
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    ###############
    # Transcripts #
    ###############

    # NOTE(review): 'param: dict = {}' is a mutable default argument; the method
    # only reads it, but replacing the default with None would be safer
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
9347 This parameter is used to identify the table where the transcripts data is stored for the 9348 prioritization process 9349 :type transcripts_table: str 9350 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9351 that contains various configuration settings for the prioritization process of transcripts. It 9352 is used to customize the behavior of the prioritization algorithm and includes settings such as 9353 the prefix for prioritization fields, default profiles, and other 9354 :type param: dict 9355 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9356 transcripts prioritization process is successfully completed, and `False` if there are any 9357 issues or if no profile is defined for transcripts prioritization. 9358 """ 9359 9360 log.debug("Start transcripts prioritization...") 9361 9362 # Param 9363 if not param: 9364 param = self.get_param() 9365 9366 # Variants table 9367 table_variants = self.get_table_variants() 9368 log.debug(f"transcripts_table={transcripts_table}") 9369 # Transcripts table 9370 if transcripts_table is None: 9371 log.debug(f"transcripts_table={transcripts_table}") 9372 transcripts_table = self.create_transcript_view( 9373 transcripts_table="transcripts", param=param 9374 ) 9375 log.debug(f"transcripts_table={transcripts_table}") 9376 if transcripts_table is None: 9377 msg_err = "No Transcripts table availalble" 9378 log.error(msg_err) 9379 raise ValueError(msg_err) 9380 9381 # Get transcripts columns 9382 columns_as_list_query = f""" 9383 DESCRIBE {transcripts_table} 9384 """ 9385 columns_as_list = list( 9386 self.get_query_to_df(columns_as_list_query)["column_name"] 9387 ) 9388 9389 # Create INFO if not exists 9390 if "INFO" not in columns_as_list: 9391 query_add_info = f""" 9392 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9393 """ 9394 self.execute_query(query_add_info) 9395 9396 # Prioritization param and Force only PZ Score 
and Flag 9397 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9398 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9399 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9400 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9401 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9402 pz_profile_default = ( 9403 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9404 ) 9405 9406 # Exit if no profile 9407 if pz_profile_default is None: 9408 log.warning("No profile defined for transcripts prioritization") 9409 return False 9410 9411 # Prioritization 9412 prioritization_result = self.prioritization( 9413 table=transcripts_table, 9414 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9415 ) 9416 if not prioritization_result: 9417 log.warning("Transcripts prioritization not processed") 9418 return False 9419 9420 # Explode PZ fields 9421 self.explode_infos( 9422 table=transcripts_table, 9423 fields=param.get("transcripts", {}) 9424 .get("prioritization", {}) 9425 .get("pzfields", []), 9426 ) 9427 9428 # Export Transcripts prioritization infos to variants table 9429 query_update = f""" 9430 WITH RankedTranscripts AS ( 9431 SELECT 9432 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9433 ROW_NUMBER() OVER ( 9434 PARTITION BY "#CHROM", POS, REF, ALT 9435 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9436 ) AS rn 9437 FROM 9438 {transcripts_table} 9439 ) 9440 UPDATE {table_variants} 9441 SET 9442 INFO = CONCAT(CASE 9443 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9444 THEN '' 9445 ELSE concat("INFO", ';') 9446 END, 9447 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9448 ) 9449 FROM 9450 RankedTranscripts 9451 WHERE 9452 rn = 1 9453 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9454 AND variants."POS" = RankedTranscripts."POS" 9455 AND 
variants."REF" = RankedTranscripts."REF" 9456 AND variants."ALT" = RankedTranscripts."ALT" 9457 9458 """ 9459 self.execute_query(query=query_update) 9460 9461 # Add PZ Transcript in header 9462 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9463 pz_fields_transcripts, 9464 ".", 9465 "String", 9466 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9467 "unknown", 9468 "unknown", 9469 code_type_map["String"], 9470 ) 9471 9472 # Return 9473 return True 9474 9475 def create_transcript_view_from_columns_map( 9476 self, 9477 transcripts_table: str = "transcripts", 9478 columns_maps: dict = {}, 9479 added_columns: list = [], 9480 temporary_tables: list = None, 9481 annotation_fields: list = None, 9482 ) -> tuple[list, list, list]: 9483 """ 9484 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9485 specified columns mapping for transcripts data. 9486 9487 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9488 the table where the transcripts data is stored or will be stored in the database. This table 9489 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9490 predictions, etc. It defaults to "transcripts, defaults to transcripts 9491 :type transcripts_table: str (optional) 9492 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9493 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9494 represents a mapping configuration for a specific set of columns. 
It typically includes details such 9495 as the main transcript column and additional information columns 9496 :type columns_maps: dict 9497 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9498 function is a list that stores the additional columns that will be added to the view being created 9499 based on the columns map provided. These columns are generated by exploding the transcript 9500 information columns along with the main transcript column 9501 :type added_columns: list 9502 :param temporary_tables: The `temporary_tables` parameter in the 9503 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9504 tables created during the process of creating a transcript view from a columns map. These temporary 9505 tables are used to store intermediate results or transformations before the final view is generated 9506 :type temporary_tables: list 9507 :param annotation_fields: The `annotation_fields` parameter in the 9508 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9509 for annotation in the query view creation process. These fields are extracted from the 9510 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9511 :type annotation_fields: list 9512 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9513 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
9514 """ 9515 9516 log.debug("Start transcrpts view creation from columns map...") 9517 9518 # "from_columns_map": [ 9519 # { 9520 # "transcripts_column": "Ensembl_transcriptid", 9521 # "transcripts_infos_columns": [ 9522 # "genename", 9523 # "Ensembl_geneid", 9524 # "LIST_S2_score", 9525 # "LIST_S2_pred", 9526 # ], 9527 # }, 9528 # { 9529 # "transcripts_column": "Ensembl_transcriptid", 9530 # "transcripts_infos_columns": [ 9531 # "genename", 9532 # "VARITY_R_score", 9533 # "Aloft_pred", 9534 # ], 9535 # }, 9536 # ], 9537 9538 # Init 9539 if temporary_tables is None: 9540 temporary_tables = [] 9541 if annotation_fields is None: 9542 annotation_fields = [] 9543 9544 # Variants table 9545 table_variants = self.get_table_variants() 9546 9547 for columns_map in columns_maps: 9548 9549 # Transcript column 9550 transcripts_column = columns_map.get("transcripts_column", None) 9551 9552 # Transcripts infos columns 9553 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9554 9555 if transcripts_column is not None: 9556 9557 # Explode 9558 added_columns += self.explode_infos( 9559 fields=[transcripts_column] + transcripts_infos_columns 9560 ) 9561 9562 # View clauses 9563 clause_select = [] 9564 for field in [transcripts_column] + transcripts_infos_columns: 9565 clause_select.append( 9566 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9567 ) 9568 if field not in [transcripts_column]: 9569 annotation_fields.append(field) 9570 9571 # Querey View 9572 query = f""" 9573 SELECT 9574 "#CHROM", POS, REF, ALT, 9575 "{transcripts_column}" AS 'transcript', 9576 {", ".join(clause_select)} 9577 FROM ( 9578 SELECT 9579 "#CHROM", POS, REF, ALT, 9580 {", ".join(clause_select)} 9581 FROM {table_variants} 9582 ) 9583 WHERE "{transcripts_column}" IS NOT NULL 9584 """ 9585 9586 # Create temporary table 9587 temporary_table = transcripts_table + "".join( 9588 random.choices(string.ascii_uppercase + string.digits, k=10) 9589 ) 9590 9591 # Temporary_tables 
9592 temporary_tables.append(temporary_table) 9593 query_view = f""" 9594 CREATE TEMPORARY TABLE {temporary_table} 9595 AS ({query}) 9596 """ 9597 self.execute_query(query=query_view) 9598 9599 return added_columns, temporary_tables, annotation_fields 9600 9601 def create_transcript_view_from_column_format( 9602 self, 9603 transcripts_table: str = "transcripts", 9604 column_formats: dict = {}, 9605 temporary_tables: list = None, 9606 annotation_fields: list = None, 9607 ) -> tuple[list, list, list]: 9608 """ 9609 The `create_transcript_view_from_column_format` function generates a transcript view based on 9610 specified column formats, adds additional columns and annotation fields, and returns the list of 9611 temporary tables and annotation fields. 9612 9613 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9614 the table containing the transcripts data. This table will be used as the base table for creating 9615 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9616 different table name if needed, defaults to transcripts 9617 :type transcripts_table: str (optional) 9618 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9619 about the columns to be used for creating the transcript view. Each entry in the dictionary 9620 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9621 the provided code snippet: 9622 :type column_formats: dict 9623 :param temporary_tables: The `temporary_tables` parameter in the 9624 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9625 views created during the process of creating a transcript view from a column format. These temporary 9626 views are used to manipulate and extract data before generating the final transcript view. 
It 9627 :type temporary_tables: list 9628 :param annotation_fields: The `annotation_fields` parameter in the 9629 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9630 that are extracted from the temporary views created during the process. These annotation fields are 9631 obtained by querying the temporary views and extracting the column names excluding specific columns 9632 like `#CH 9633 :type annotation_fields: list 9634 :return: The `create_transcript_view_from_column_format` function returns two lists: 9635 `temporary_tables` and `annotation_fields`. 9636 """ 9637 9638 log.debug("Start transcrpts view creation from column format...") 9639 9640 # "from_column_format": [ 9641 # { 9642 # "transcripts_column": "ANN", 9643 # "transcripts_infos_column": "Feature_ID", 9644 # } 9645 # ], 9646 9647 # Init 9648 if temporary_tables is None: 9649 temporary_tables = [] 9650 if annotation_fields is None: 9651 annotation_fields = [] 9652 9653 for column_format in column_formats: 9654 9655 # annotation field and transcript annotation field 9656 annotation_field = column_format.get("transcripts_column", "ANN") 9657 transcript_annotation = column_format.get( 9658 "transcripts_infos_column", "Feature_ID" 9659 ) 9660 9661 # Temporary View name 9662 temporary_view_name = transcripts_table + "".join( 9663 random.choices(string.ascii_uppercase + string.digits, k=10) 9664 ) 9665 9666 # Create temporary view name 9667 temporary_view_name = self.annotation_format_to_table( 9668 uniquify=True, 9669 annotation_field=annotation_field, 9670 view_name=temporary_view_name, 9671 annotation_id=transcript_annotation, 9672 ) 9673 9674 # Annotation fields 9675 if temporary_view_name: 9676 query_annotation_fields = f""" 9677 SELECT * 9678 FROM ( 9679 DESCRIBE SELECT * 9680 FROM {temporary_view_name} 9681 ) 9682 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9683 """ 9684 df_annotation_fields = self.get_query_to_df( 9685 
query=query_annotation_fields 9686 ) 9687 9688 # Add temporary view and annotation fields 9689 temporary_tables.append(temporary_view_name) 9690 annotation_fields += list(set(df_annotation_fields["column_name"])) 9691 9692 return temporary_tables, annotation_fields 9693 9694 def create_transcript_view( 9695 self, 9696 transcripts_table: str = None, 9697 transcripts_table_drop: bool = True, 9698 param: dict = {}, 9699 ) -> str: 9700 """ 9701 The `create_transcript_view` function generates a transcript view by processing data from a 9702 specified table based on provided parameters and structural information. 9703 9704 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9705 is used to specify the name of the table that will store the final transcript view data. If a table 9706 name is not provided, the function will create a new table to store the transcript view data, and by 9707 default,, defaults to transcripts 9708 :type transcripts_table: str (optional) 9709 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9710 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9711 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9712 the function will drop the existing transcripts table if it exists, defaults to True 9713 :type transcripts_table_drop: bool (optional) 9714 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9715 contains information needed to create a transcript view. It includes details such as the structure 9716 of the transcripts, columns mapping, column formats, and other necessary information for generating 9717 the view. 
This parameter allows for flexibility and customization 9718 :type param: dict 9719 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9720 created or modified during the execution of the function. 9721 """ 9722 9723 log.debug("Start transcripts view creation...") 9724 9725 # Default 9726 transcripts_table_default = "transcripts" 9727 9728 # Param 9729 if not param: 9730 param = self.get_param() 9731 9732 # Struct 9733 struct = param.get("transcripts", {}).get("struct", None) 9734 9735 if struct: 9736 9737 # Transcripts table 9738 if transcripts_table is None: 9739 transcripts_table = param.get("transcripts", {}).get( 9740 "table", transcripts_table_default 9741 ) 9742 9743 # added_columns 9744 added_columns = [] 9745 9746 # Temporary tables 9747 temporary_tables = [] 9748 9749 # Annotation fields 9750 annotation_fields = [] 9751 9752 # from columns map 9753 columns_maps = struct.get("from_columns_map", []) 9754 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9755 self.create_transcript_view_from_columns_map( 9756 transcripts_table=transcripts_table, 9757 columns_maps=columns_maps, 9758 added_columns=added_columns, 9759 temporary_tables=temporary_tables, 9760 annotation_fields=annotation_fields, 9761 ) 9762 ) 9763 added_columns += added_columns_tmp 9764 temporary_tables += temporary_tables_tmp 9765 annotation_fields += annotation_fields_tmp 9766 9767 # from column format 9768 column_formats = struct.get("from_column_format", []) 9769 temporary_tables_tmp, annotation_fields_tmp = ( 9770 self.create_transcript_view_from_column_format( 9771 transcripts_table=transcripts_table, 9772 column_formats=column_formats, 9773 temporary_tables=temporary_tables, 9774 annotation_fields=annotation_fields, 9775 ) 9776 ) 9777 temporary_tables += temporary_tables_tmp 9778 annotation_fields += annotation_fields_tmp 9779 9780 # Merge temporary tables query 9781 query_merge = "" 9782 for temporary_table in 
temporary_tables: 9783 9784 # First temporary table 9785 if not query_merge: 9786 query_merge = f""" 9787 SELECT * FROM {temporary_table} 9788 """ 9789 # other temporary table (using UNION) 9790 else: 9791 query_merge += f""" 9792 UNION BY NAME SELECT * FROM {temporary_table} 9793 """ 9794 9795 # Merge on transcript 9796 query_merge_on_transcripts_annotation_fields = [] 9797 # Aggregate all annotations fields 9798 for annotation_field in set(annotation_fields): 9799 query_merge_on_transcripts_annotation_fields.append( 9800 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9801 ) 9802 # Query for transcripts view 9803 query_merge_on_transcripts = f""" 9804 SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9805 FROM ({query_merge}) 9806 GROUP BY "#CHROM", POS, REF, ALT, transcript 9807 """ 9808 9809 # Drop transcript view is necessary 9810 if transcripts_table_drop: 9811 query_drop = f""" 9812 DROP TABLE IF EXISTS {transcripts_table}; 9813 """ 9814 self.execute_query(query=query_drop) 9815 9816 # Merge and create transcript view 9817 query_create_view = f""" 9818 CREATE TABLE IF NOT EXISTS {transcripts_table} 9819 AS {query_merge_on_transcripts} 9820 """ 9821 self.execute_query(query=query_create_view) 9822 9823 # Remove added columns 9824 for added_column in added_columns: 9825 self.drop_column(column=added_column) 9826 9827 else: 9828 9829 transcripts_table = None 9830 9831 return transcripts_table 9832 9833 def annotation_format_to_table( 9834 self, 9835 uniquify: bool = True, 9836 annotation_field: str = "ANN", 9837 annotation_id: str = "Feature_ID", 9838 view_name: str = "transcripts", 9839 ) -> str: 9840 """ 9841 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 9842 table format. 
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
        table format.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
            values in the output or not. If set to `True`, the function will make sure that the output values
            are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
            contains the annotation information for each variant (e.g. snpEff "ANN"), defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter is the sub-field used as transcript identifier;
            it is stripped to alphanumeric characters and exposed as the 'transcript' column, defaults to
            Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter is the name of the temporary table created to store the
            transformed annotation data, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the view created (`view_name`), or None when `annotation_field` is not present
            in the VCF header infos.
        :raises ValueError: when the annotation field description does not contain a quoted
            pipe-separated header.
        """

        # Name of the intermediate exploded-annotation column
        annotation_format = "annotation_explode"

        # Transcript annotation: keep alphanumeric characters only (used as a SQL identifier below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): any truthy prefix returned by get_explode_infos_prefix()
        # is replaced by the literal "INFO/"; a falsy prefix is kept as-is and
        # concatenated below — confirm this is intended (a None prefix would
        # raise a TypeError on concatenation)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields (exploded column names on the variants table)
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (tracked so they can be dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header: the sub-field names are expected inside single
            # quotes in the INFO description, separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) name mapped to the original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (also dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe holding the raw annotation column per variant
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Create annotation columns: explode each annotation string to JSON
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find keys present in the generated JSON
            # NOTE(review): the SQL references column {annotation_format} while
            # the pandas column created above is named {annotation_format_infos}
            # (prefix + name); these only match when prefix is empty — confirm
            # against get_explode_infos_prefix() behavior
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Check keys
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only, safe as a SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Type: extract the key's values to infer its column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append a typed, NULLIF-protected extraction clause for this key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create view with one row per annotation entry and a 'transcript' column
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field: str = None,
        param: dict = {},
    ) -> bool:
        """
        The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
        a variants table with information from the transcripts in JSON format.

        :param transcripts_table: Name of the table containing the transcripts data; when not provided it is
            taken from param "transcripts.table" (default "transcripts")
        :type transcripts_table: str
        :param transcripts_column_id: Column of `transcripts_table` holding the transcript identifier; when
            not provided it is taken from param "transcripts.column_id" (default "transcript")
        :type transcripts_column_id: str
        :param transcripts_info_json: Name of the JSON column added to the variants table to store the
            transcripts content; when not provided it is taken from param "transcripts.transcripts_info_json"
        :type transcripts_info_json: str
        :param transcripts_info_field: Name of the VCF INFO field that will carry the transcripts JSON; when
            not provided it is taken from param "transcripts.transcripts_info_field"
        :type transcripts_info_field: str
        :param param: Parameters dictionary; when empty, `self.get_param()` is used
        :type param: dict
        :return: True when the update was performed, False when neither `transcripts_info_json` nor
            `transcripts_info_field` is defined.
        """

        log.debug("Start transcripts view to JSON...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_field_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info JSON column
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field
        if transcripts_info_field is None:
            transcripts_info_field = param.get("transcripts", {}).get(
                "transcripts_info_field", transcripts_info_field_default
            )

        # Variants table
        table_variants = self.get_table_variants()

        # Check info columns param: nothing to do when no output is configured
        if transcripts_info_json is None and transcripts_info_field is None:
            return False

        # Transcripts infos columns: every column except the variant keys and the transcript id
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # View results: split comma-separated values into rows and build the JSON struct entries
        clause_select = []
        clause_to_json = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)

        # Update SET clauses, filled depending on the configured outputs
        update_set = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add to update
            update_set.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

            # Add header
            # NOTE(review): "unknwon" below is a typo for "unknown" kept as-is
            # (it is emitted in the VCF header source/version fields)
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        # Transcripts to info field in JSON
        if transcripts_info_field is not None:

            # Add to update
            # NOTE(review): this clause and the subquery alias below interpolate
            # {transcripts_info_json}, which may be None when only
            # transcripts_info_field is configured — the generated SQL then
            # uses the literal identifier "None"; confirm this path is exercised
            update_set.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
                transcripts_info_field,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        # Update query: aggregate all transcripts of each variant into one JSON
        # object keyed by transcript id, then join it back onto the variants table
        query_update = f"""
            UPDATE {table_variants}
            SET {", ".join(update_set)}
            FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                        )::JSON AS {transcripts_info_json}
                    FROM
                        (
                            SELECT
                                "#CHROM", POS, REF, ALT,
                                "{transcripts_column_id}",
                                to_json(
                                    {{{",".join(clause_to_json)}}}
                                )::JSON AS json_output
                            FROM
                                (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
            WHERE {table_variants}."#CHROM" = t."#CHROM"
            AND {table_variants}."POS" = t."POS"
            AND {table_variants}."REF" = t."REF"
            AND {table_variants}."ALT" = t."ALT"
        """

        self.execute_query(query=query_update)

        return True
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data() 80 81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "") 106 107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 
        :param config: The `config` parameter in the `set_config` function is a dictionary object that
        contains configuration settings for the class. When you call the `set_config` function with a
        dictionary object as the argument, it will set that dictionary as the configuration object for
        the class
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        This function sets a parameter object for the class based on the input dictionary.

        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
        as the `param` attribute of the class instance
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        This function initializes the variables that will be used in the rest of the class
        """

        # Prefix used for temporary resources and the main table name
        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Map of filter operator keywords to their SQL equivalents
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type name -> internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type name -> SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        # NOTE(review): "additionnal" is misspelled, but this attribute name is
        # part of the public surface — keep as-is.
        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
        returns False.
        :return: The value of the indexing parameter.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        The function `get_connexion_config` returns a dictionary containing the configuration for a
        connection, including the number of threads and memory limit.
        :return: a dictionary containing the configuration for the Connexion library.
175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config 211 212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict 236 237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 
242 """ 243 244 # Default connexion db 245 default_connexion_db = ":memory:" 246 247 # Find connexion db 248 if self.get_input_format() in ["db", "duckdb"]: 249 connexion_db = self.get_input() 250 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 251 connexion_db = default_connexion_db 252 elif self.get_connexion_type() in ["tmpfile"]: 253 tmp_name = tempfile.mkdtemp( 254 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 255 ) 256 connexion_db = f"{tmp_name}/tmp.db" 257 elif self.get_connexion_type() != "": 258 connexion_db = self.get_connexion_type() 259 else: 260 connexion_db = default_connexion_db 261 262 # Set connexion db 263 self.connexion_db = connexion_db 264 265 return connexion_db 266 267 def set_connexion(self, conn) -> None: 268 """ 269 The function `set_connexion` creates a connection to a database, with options for different 270 database formats and settings. 271 272 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 273 database. If a connection is not provided, a new connection to an in-memory database is created. 
        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
        sqlite)
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        # NOTE(review): settings come from local config and are
                        # interpolated into the PRAGMA statement as-is
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)
            # NOTE(review): an unknown connexion_format leaves conn as None here;
            # get_connexion_format() raises later on

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        The `set_output` function in Python sets the output file based on the input or a specified key
        in the config file, extracting the output name, extension, and format.

        :param output: The `output` parameter in the `set_output` method is used to specify the name of
        the output file. If the config file has an 'output' key, the method sets the output to the value
        of that key.
If no output is provided, it sets the output to `None` 321 :type output: str 322 """ 323 324 if output and not isinstance(output, str): 325 self.output = output.name 326 else: 327 self.output = output 328 329 # Output format 330 if self.output: 331 output_name, output_extension = os.path.splitext(self.output) 332 self.output_name = output_name 333 self.output_extension = output_extension 334 self.output_format = self.output_extension.replace(".", "") 335 else: 336 self.output_name = None 337 self.output_extension = None 338 self.output_format = None 339 340 def set_header(self) -> None: 341 """ 342 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 343 """ 344 345 input_file = self.get_input() 346 default_header_list = [ 347 "##fileformat=VCFv4.2", 348 "#CHROM POS ID REF ALT QUAL FILTER INFO", 349 ] 350 351 # Full path 352 input_file = full_path(input_file) 353 354 if input_file: 355 356 input_format = self.get_input_format() 357 input_compressed = self.get_input_compressed() 358 config = self.get_config() 359 header_list = default_header_list 360 if input_format in [ 361 "vcf", 362 "hdr", 363 "tsv", 364 "csv", 365 "psv", 366 "parquet", 367 "db", 368 "duckdb", 369 ]: 370 # header provided in param 371 if config.get("header_file", None): 372 with open(config.get("header_file"), "rt") as f: 373 header_list = self.read_vcf_header(f) 374 # within a vcf file format (header within input file itsself) 375 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 376 # within a compressed vcf file format (.vcf.gz) 377 if input_compressed: 378 with bgzf.open(input_file, "rt") as f: 379 header_list = self.read_vcf_header(f) 380 # within an uncompressed vcf file format (.vcf) 381 else: 382 with open(input_file, "rt") as f: 383 header_list = self.read_vcf_header(f) 384 # header provided in default external file .hdr 385 elif os.path.exists((input_file + ".hdr")): 386 with open(input_file + ".hdr", "rt") as f: 387 header_list = 
self.read_vcf_header(f) 388 else: 389 try: # Try to get header info fields and file columns 390 391 with tempfile.TemporaryDirectory() as tmpdir: 392 393 # Create database 394 db_for_header = Database(database=input_file) 395 396 # Get header columns for infos fields 397 db_header_from_columns = ( 398 db_for_header.get_header_from_columns() 399 ) 400 401 # Get real columns in the file 402 db_header_columns = db_for_header.get_columns() 403 404 # Write header file 405 header_file_tmp = os.path.join(tmpdir, "header") 406 f = open(header_file_tmp, "w") 407 vcf.Writer(f, db_header_from_columns) 408 f.close() 409 410 # Replace #CHROM line with rel columns 411 header_list = db_for_header.read_header_file( 412 header_file=header_file_tmp 413 ) 414 header_list[-1] = "\t".join(db_header_columns) 415 416 except: 417 418 log.warning( 419 f"No header for file {input_file}. Set as default VCF header" 420 ) 421 header_list = default_header_list 422 423 else: # try for unknown format ? 424 425 log.error(f"Input file format '{input_format}' not available") 426 raise ValueError(f"Input file format '{input_format}' not available") 427 428 if not header_list: 429 header_list = default_header_list 430 431 # header as list 432 self.header_list = header_list 433 434 # header as VCF object 435 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 436 437 else: 438 439 self.header_list = None 440 self.header_vcf = None 441 442 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 443 """ 444 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 445 DataFrame based on the connection format. 446 447 :param query: The `query` parameter in the `get_query_to_df` function is a string that 448 represents the SQL query you want to execute. 
This query will be used to fetch data from a 449 database and convert it into a pandas DataFrame 450 :type query: str 451 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 452 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 453 function will only fetch up to that number of rows from the database query result. If no limit 454 is specified, 455 :type limit: int 456 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 457 """ 458 459 # Connexion format 460 connexion_format = self.get_connexion_format() 461 462 # Limit in query 463 if limit: 464 pd.set_option("display.max_rows", limit) 465 if connexion_format in ["duckdb"]: 466 df = ( 467 self.conn.execute(query) 468 .fetch_record_batch(limit) 469 .read_next_batch() 470 .to_pandas() 471 ) 472 elif connexion_format in ["sqlite"]: 473 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 474 475 # Full query 476 else: 477 if connexion_format in ["duckdb"]: 478 df = self.conn.execute(query).df() 479 elif connexion_format in ["sqlite"]: 480 df = pd.read_sql_query(query, self.conn) 481 482 return df 483 484 def get_overview(self) -> None: 485 """ 486 The function prints the input, output, config, and dataframe of the current object 487 """ 488 table_variants_from = self.get_table_variants(clause="from") 489 sql_columns = self.get_header_columns_as_sql() 490 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 491 df = self.get_query_to_df(sql_query_export) 492 log.info( 493 "Input: " 494 + str(self.get_input()) 495 + " [" 496 + str(str(self.get_input_format())) 497 + "]" 498 ) 499 log.info( 500 "Output: " 501 + str(self.get_output()) 502 + " [" 503 + str(str(self.get_output_format())) 504 + "]" 505 ) 506 log.info("Config: ") 507 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 508 "\n" 509 ): 510 log.info("\t" + str(d)) 511 log.info("Param: ") 512 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 513 "\n" 514 ): 515 log.info("\t" + str(d)) 516 log.info("Sample list: " + str(self.get_header_sample_list())) 517 log.info("Dataframe: ") 518 for d in str(df).split("\n"): 519 log.info("\t" + str(d)) 520 521 # garbage collector 522 del df 523 gc.collect() 524 525 return None 526 527 def get_stats(self) -> dict: 528 """ 529 The `get_stats` function calculates and returns various statistics of the current object, 530 including information about the input file, variants, samples, header fields, quality, and 531 SNVs/InDels. 532 :return: a dictionary containing various statistics of the current object. The dictionary has 533 the following structure: 534 """ 535 536 # Log 537 log.info(f"Stats Calculation...") 538 539 # table varaints 540 table_variants_from = self.get_table_variants() 541 542 # stats dict 543 stats = {"Infos": {}} 544 545 ### File 546 input_file = self.get_input() 547 stats["Infos"]["Input file"] = input_file 548 549 # Header 550 header_infos = self.get_header().infos 551 header_formats = self.get_header().formats 552 header_infos_list = list(header_infos) 553 header_formats_list = list(header_formats) 554 555 ### Variants 556 557 stats["Variants"] = {} 558 559 # Variants by chr 560 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 561 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 562 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 563 by=["CHROM"], kind="quicksort" 564 ) 565 566 # Total number of variants 567 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 568 569 # Calculate percentage 570 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 571 lambda x: (x / nb_of_variants) 572 ) 573 574 stats["Variants"]["Number of variants by chromosome"] = ( 575 nb_of_variants_by_chrom.to_dict(orient="index") 576 ) 577 578 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 579 580 ### Samples 581 582 # Init 583 samples = {} 584 nb_of_samples = 0 585 586 # Check Samples 587 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 588 log.debug(f"Check samples...") 589 for sample in self.get_header_sample_list(): 590 sql_query_samples = f""" 591 SELECT '{sample}' as sample, 592 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 593 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 594 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 595 FROM {table_variants_from} 596 WHERE ( 597 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 598 AND 599 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 600 ) 601 GROUP BY genotype 602 """ 603 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 604 sample_genotype_count = sql_query_genotype_df["count"].sum() 605 if len(sql_query_genotype_df): 606 nb_of_samples += 1 607 samples[f"{sample} - {sample_genotype_count} variants"] = ( 608 sql_query_genotype_df.to_dict(orient="index") 609 ) 610 611 stats["Samples"] = samples 612 stats["Infos"]["Number of samples"] = nb_of_samples 613 614 # # 615 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 616 # stats["Infos"]["Number of samples"] = nb_of_samples 617 # elif nb_of_samples: 618 # stats["Infos"]["Number of samples"] = "not a VCF format" 619 620 ### INFO and FORMAT fields 621 header_types_df = {} 622 header_types_list = { 623 "List of INFO fields": header_infos, 624 "List of FORMAT fields": header_formats, 625 } 626 i = 0 627 for header_type in header_types_list: 628 629 header_type_infos = header_types_list.get(header_type) 630 header_infos_dict = {} 631 632 for info in header_type_infos: 633 634 i += 1 635 header_infos_dict[i] = {} 636 637 # ID 638 header_infos_dict[i]["id"] = info 639 640 # num 641 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 642 if header_type_infos[info].num in genotype_map.keys(): 643 header_infos_dict[i]["Number"] = genotype_map.get( 644 header_type_infos[info].num 645 ) 646 else: 647 header_infos_dict[i]["Number"] = header_type_infos[info].num 648 649 # type 650 if header_type_infos[info].type: 651 header_infos_dict[i]["Type"] = header_type_infos[info].type 652 else: 653 header_infos_dict[i]["Type"] = "." 654 655 # desc 656 if header_type_infos[info].desc != None: 657 header_infos_dict[i]["Description"] = header_type_infos[info].desc 658 else: 659 header_infos_dict[i]["Description"] = "" 660 661 if len(header_infos_dict): 662 header_types_df[header_type] = pd.DataFrame.from_dict( 663 header_infos_dict, orient="index" 664 ).to_dict(orient="index") 665 666 # Stats 667 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 668 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 669 stats["Header"] = header_types_df 670 671 ### QUAL 672 if "QUAL" in self.get_header_columns(): 673 sql_query_qual = f""" 674 SELECT 675 avg(CAST(QUAL AS INTEGER)) AS Average, 676 min(CAST(QUAL AS INTEGER)) AS Minimum, 677 max(CAST(QUAL AS INTEGER)) AS Maximum, 678 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 679 median(CAST(QUAL AS INTEGER)) AS Median, 680 variance(CAST(QUAL AS INTEGER)) AS Variance 681 FROM {table_variants_from} 682 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 683 """ 684 685 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 686 stats["Quality"] = {"Stats": qual} 687 688 ### SNV and InDel 689 690 sql_query_snv = f""" 691 692 SELECT Type, count FROM ( 693 694 SELECT 695 'Total' AS Type, 696 count(*) AS count 697 FROM {table_variants_from} 698 699 UNION 700 701 SELECT 702 'MNV' AS Type, 703 count(*) AS count 704 FROM {table_variants_from} 705 WHERE len(REF) > 1 AND len(ALT) > 1 706 AND len(REF) = len(ALT) 707 708 UNION 709 710 SELECT 711 'InDel' AS Type, 712 count(*) AS count 713 FROM 
{table_variants_from} 714 WHERE len(REF) > 1 OR len(ALT) > 1 715 AND len(REF) != len(ALT) 716 717 UNION 718 719 SELECT 720 'SNV' AS Type, 721 count(*) AS count 722 FROM {table_variants_from} 723 WHERE len(REF) = 1 AND len(ALT) = 1 724 725 ) 726 727 ORDER BY count DESC 728 729 """ 730 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 731 732 sql_query_snv_substitution = f""" 733 SELECT 734 concat(REF, '>', ALT) AS 'Substitution', 735 count(*) AS count 736 FROM {table_variants_from} 737 WHERE len(REF) = 1 AND len(ALT) = 1 738 GROUP BY REF, ALT 739 ORDER BY count(*) DESC 740 """ 741 snv_substitution = ( 742 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 743 ) 744 stats["Variants"]["Counts"] = snv_indel 745 stats["Variants"]["Substitutions"] = snv_substitution 746 747 return stats 748 749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file 771 772 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 773 """ 774 The `print_stats` function generates a markdown file and prints the statistics contained in a 775 JSON file in a formatted manner. 776 777 :param output_file: The `output_file` parameter is a string that specifies the path and filename 778 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 779 provided, a temporary directory will be created and the stats will be saved in a file named 780 "stats.md" within that 781 :type output_file: str 782 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 783 file where the statistics will be saved. If no value is provided, a temporary directory will be 784 created and a default file name "stats.json" will be used 785 :type json_file: str 786 :return: The function `print_stats` does not return any value. It has a return type annotation 787 of `None`. 788 """ 789 790 # Full path 791 output_file = full_path(output_file) 792 json_file = full_path(json_file) 793 794 with tempfile.TemporaryDirectory() as tmpdir: 795 796 # Files 797 if not output_file: 798 output_file = os.path.join(tmpdir, "stats.md") 799 if not json_file: 800 json_file = os.path.join(tmpdir, "stats.json") 801 802 # Create folders 803 if not os.path.exists(os.path.dirname(output_file)): 804 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 805 if not os.path.exists(os.path.dirname(json_file)): 806 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 807 808 # Create stats JSON file 809 stats_file = self.stats_to_file(file=json_file) 810 811 # Print stats file 812 with open(stats_file) as f: 813 stats = yaml.safe_load(f) 814 815 # Output 816 output_title = [] 817 output_index = [] 818 output = [] 819 820 # Title 821 output_title.append("# HOWARD Stats") 822 823 # Index 824 output_index.append("## Index") 825 826 # Process sections 827 for section in stats: 828 infos = stats.get(section) 829 section_link = "#" + section.lower().replace(" ", "-") 830 output.append(f"## {section}") 831 output_index.append(f"- [{section}]({section_link})") 832 833 if len(infos): 834 for info in infos: 835 try: 836 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 837 is_df = True 838 except: 839 try: 840 df = pd.DataFrame.from_dict( 841 
json.loads((infos.get(info))), orient="index" 842 ) 843 is_df = True 844 except: 845 is_df = False 846 if is_df: 847 output.append(f"### {info}") 848 info_link = "#" + info.lower().replace(" ", "-") 849 output_index.append(f" - [{info}]({info_link})") 850 output.append(f"{df.to_markdown(index=False)}") 851 else: 852 output.append(f"- {info}: {infos.get(info)}") 853 else: 854 output.append(f"NA") 855 856 # Write stats in markdown file 857 with open(output_file, "w") as fp: 858 for item in output_title: 859 fp.write("%s\n" % item) 860 for item in output_index: 861 fp.write("%s\n" % item) 862 for item in output: 863 fp.write("%s\n" % item) 864 865 # Output stats in markdown 866 print("") 867 print("\n\n".join(output_title)) 868 print("") 869 print("\n\n".join(output)) 870 print("") 871 872 return None 873 874 def get_input(self) -> str: 875 """ 876 It returns the value of the input variable. 877 :return: The input is being returned. 878 """ 879 return self.input 880 881 def get_input_format(self, input_file: str = None) -> str: 882 """ 883 This function returns the format of the input variable, either from the provided input file or 884 by prompting for input. 885 886 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 887 represents the file path of the input file. If no `input_file` is provided when calling the 888 method, it will default to `None` 889 :type input_file: str 890 :return: The format of the input variable is being returned. 891 """ 892 893 if not input_file: 894 input_file = self.get_input() 895 input_format = get_file_format(input_file) 896 return input_format 897 898 def get_input_compressed(self, input_file: str = None) -> str: 899 """ 900 The function `get_input_compressed` returns the format of the input variable after compressing 901 it. 902 903 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 904 that represents the file path of the input file. 
If no `input_file` is provided when calling the 905 method, it will default to `None` and the method will then call `self.get_input()` to 906 :type input_file: str 907 :return: The function `get_input_compressed` returns the compressed format of the input 908 variable. 909 """ 910 911 if not input_file: 912 input_file = self.get_input() 913 input_compressed = get_file_compressed(input_file) 914 return input_compressed 915 916 def get_output(self) -> str: 917 """ 918 It returns the output of the neuron. 919 :return: The output of the neural network. 920 """ 921 922 return self.output 923 924 def get_output_format(self, output_file: str = None) -> str: 925 """ 926 The function `get_output_format` returns the format of the input variable or the output file if 927 provided. 928 929 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 930 that represents the file path of the output file. If no `output_file` is provided when calling 931 the method, it will default to the output obtained from the `get_output` method of the class 932 instance. The 933 :type output_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not output_file: 938 output_file = self.get_output() 939 output_format = get_file_format(output_file) 940 941 return output_format 942 943 def get_config(self) -> dict: 944 """ 945 It returns the config 946 :return: The config variable is being returned. 947 """ 948 return self.config 949 950 def get_param(self) -> dict: 951 """ 952 It returns the param 953 :return: The param variable is being returned. 954 """ 955 return self.param 956 957 def get_connexion_db(self) -> str: 958 """ 959 It returns the connexion_db attribute of the object 960 :return: The connexion_db is being returned. 961 """ 962 return self.connexion_db 963 964 def get_prefix(self) -> str: 965 """ 966 It returns the prefix of the object. 967 :return: The prefix is being returned. 
968 """ 969 return self.prefix 970 971 def get_table_variants(self, clause: str = "select") -> str: 972 """ 973 This function returns the table_variants attribute of the object 974 975 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 976 defaults to select (optional) 977 :return: The table_variants attribute of the object. 978 """ 979 980 # Access 981 access = self.get_config().get("access", None) 982 983 # Clauses "select", "where", "update" 984 if clause in ["select", "where", "update"]: 985 table_variants = self.table_variants 986 # Clause "from" 987 elif clause in ["from"]: 988 # For Read Only 989 if self.get_input_format() in ["parquet"] and access in ["RO"]: 990 input_file = self.get_input() 991 table_variants = f"'{input_file}' as variants" 992 # For Read Write 993 else: 994 table_variants = f"{self.table_variants} as variants" 995 else: 996 table_variants = self.table_variants 997 return table_variants 998 999 def get_tmp_dir(self) -> str: 1000 """ 1001 The function `get_tmp_dir` returns the temporary directory path based on configuration 1002 parameters or a default path. 1003 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1004 configuration, parameters, and a default value of "/tmp". 1005 """ 1006 1007 return get_tmp( 1008 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1009 ) 1010 1011 def get_connexion_type(self) -> str: 1012 """ 1013 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1014 1015 :return: The connexion type is being returned. 1016 """ 1017 return self.get_config().get("connexion_type", "memory") 1018 1019 def get_connexion(self): 1020 """ 1021 It returns the connection object 1022 1023 :return: The connection object. 1024 """ 1025 return self.conn 1026 1027 def close_connexion(self) -> None: 1028 """ 1029 This function closes the connection to the database. 
1030 :return: The connection is being closed. 1031 """ 1032 return self.conn.close() 1033 1034 def get_header(self, type: str = "vcf"): 1035 """ 1036 This function returns the header of the VCF file as a list of strings 1037 1038 :param type: the type of header you want to get, defaults to vcf (optional) 1039 :return: The header of the vcf file. 1040 """ 1041 1042 if self.header_vcf: 1043 if type == "vcf": 1044 return self.header_vcf 1045 elif type == "list": 1046 return self.header_list 1047 else: 1048 if type == "vcf": 1049 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1050 return header 1051 elif type == "list": 1052 return vcf_required 1053 1054 def get_header_length(self, file: str = None) -> int: 1055 """ 1056 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1057 line. 1058 1059 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1060 header file. If this argument is provided, the function will read the header from the specified 1061 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1062 :type file: str 1063 :return: the length of the header list, excluding the #CHROM line. 1064 """ 1065 1066 if file: 1067 return len(self.read_vcf_header_file(file=file)) - 1 1068 elif self.get_header(type="list"): 1069 return len(self.get_header(type="list")) - 1 1070 else: 1071 return 0 1072 1073 def get_header_columns(self) -> str: 1074 """ 1075 This function returns the header list of a VCF 1076 1077 :return: The length of the header list. 1078 """ 1079 if self.get_header(): 1080 return self.get_header(type="list")[-1] 1081 else: 1082 return "" 1083 1084 def get_header_columns_as_list(self) -> list: 1085 """ 1086 This function returns the header list of a VCF 1087 1088 :return: The length of the header list. 
1089 """ 1090 if self.get_header(): 1091 return self.get_header_columns().strip().split("\t") 1092 else: 1093 return [] 1094 1095 def get_header_columns_as_sql(self) -> str: 1096 """ 1097 This function retruns header length (without #CHROM line) 1098 1099 :return: The length of the header list. 1100 """ 1101 sql_column_list = [] 1102 for col in self.get_header_columns_as_list(): 1103 sql_column_list.append(f'"{col}"') 1104 return ",".join(sql_column_list) 1105 1106 def get_header_sample_list(self) -> list: 1107 """ 1108 This function retruns header length (without #CHROM line) 1109 1110 :return: The length of the header list. 1111 """ 1112 return self.header_vcf.samples 1113 1114 def get_verbose(self) -> bool: 1115 """ 1116 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1117 exist 1118 1119 :return: The value of the key "verbose" in the config dictionary. 1120 """ 1121 return self.get_config().get("verbose", False) 1122 1123 def get_connexion_format(self) -> str: 1124 """ 1125 It returns the connexion format of the object. 1126 :return: The connexion_format is being returned. 1127 """ 1128 connexion_format = self.connexion_format 1129 if connexion_format not in ["duckdb", "sqlite"]: 1130 log.error(f"Unknown connexion format {connexion_format}") 1131 raise ValueError(f"Unknown connexion format {connexion_format}") 1132 else: 1133 return connexion_format 1134 1135 def insert_file_to_table( 1136 self, 1137 file, 1138 columns: str, 1139 header_len: int = 0, 1140 sep: str = "\t", 1141 chunksize: int = 1000000, 1142 ) -> None: 1143 """ 1144 The function reads a file in chunks and inserts each chunk into a table based on the specified 1145 database format. 1146 1147 :param file: The `file` parameter is the file that you want to load into a table. 
It should be 1148 the path to the file on your system 1149 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1150 should contain the names of the columns in the table where the data will be inserted. The column 1151 names should be separated by commas within the string. For example, if you have columns named 1152 "id", "name 1153 :type columns: str 1154 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1155 the number of lines to skip at the beginning of the file before reading the actual data. This 1156 parameter allows you to skip any header information present in the file before processing the 1157 data, defaults to 0 1158 :type header_len: int (optional) 1159 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1160 separator character that is used in the file being read. In this case, the default separator is 1161 set to `\t`, which represents a tab character. You can change this parameter to a different 1162 separator character if, defaults to \t 1163 :type sep: str (optional) 1164 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1165 when processing the file in chunks. In the provided code snippet, the default value for 1166 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1167 to 1000000 1168 :type chunksize: int (optional) 1169 """ 1170 1171 # Config 1172 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1173 connexion_format = self.get_connexion_format() 1174 1175 log.debug("chunksize: " + str(chunksize)) 1176 1177 if chunksize: 1178 for chunk in pd.read_csv( 1179 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1180 ): 1181 if connexion_format in ["duckdb"]: 1182 sql_insert_into = ( 1183 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1184 ) 1185 self.conn.execute(sql_insert_into) 1186 elif connexion_format in ["sqlite"]: 1187 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1188 1189 def load_data( 1190 self, 1191 input_file: str = None, 1192 drop_variants_table: bool = False, 1193 sample_size: int = 20480, 1194 ) -> None: 1195 """ 1196 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1197 table before loading the data and specify a sample size. 1198 1199 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1200 table 1201 :type input_file: str 1202 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1203 determines whether the variants table should be dropped before loading the data. If set to 1204 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1205 not be dropped, defaults to False 1206 :type drop_variants_table: bool (optional) 1207 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1208 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1209 20480 1210 :type sample_size: int (optional) 1211 """ 1212 1213 log.info("Loading...") 1214 1215 # change input file 1216 if input_file: 1217 self.set_input(input_file) 1218 self.set_header() 1219 1220 # drop variants table 1221 if drop_variants_table: 1222 self.drop_variants_table() 1223 1224 # get table variants 1225 table_variants = self.get_table_variants() 1226 1227 # Access 1228 access = self.get_config().get("access", None) 1229 log.debug(f"access: {access}") 1230 1231 # Input format and compress 1232 input_format = self.get_input_format() 1233 input_compressed = self.get_input_compressed() 1234 log.debug(f"input_format: {input_format}") 1235 log.debug(f"input_compressed: {input_compressed}") 1236 1237 # input_compressed_format 1238 if input_compressed: 1239 input_compressed_format = "gzip" 1240 else: 1241 input_compressed_format = "none" 1242 log.debug(f"input_compressed_format: {input_compressed_format}") 1243 1244 # Connexion format 1245 connexion_format = self.get_connexion_format() 1246 1247 # Sample size 1248 if not sample_size: 1249 sample_size = -1 1250 log.debug(f"sample_size: {sample_size}") 1251 1252 # Load data 1253 log.debug(f"Load Data from {input_format}") 1254 1255 # DuckDB connexion 1256 if connexion_format in ["duckdb"]: 1257 1258 # Database already exists 1259 if self.input_format in ["db", "duckdb"]: 1260 1261 if connexion_format in ["duckdb"]: 1262 log.debug(f"Input file format '{self.input_format}' duckDB") 1263 else: 1264 log.error( 1265 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1266 ) 1267 raise ValueError( 1268 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1269 ) 1270 1271 # Load from existing database format 1272 else: 1273 1274 try: 1275 # Create Table or View 1276 database = Database(database=self.input) 1277 sql_from = 
database.get_sql_from(sample_size=sample_size) 1278 1279 if access in ["RO"]: 1280 sql_load = ( 1281 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1282 ) 1283 else: 1284 sql_load = ( 1285 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1286 ) 1287 self.conn.execute(sql_load) 1288 1289 except: 1290 # Format not available 1291 log.error(f"Input file format '{self.input_format}' not available") 1292 raise ValueError( 1293 f"Input file format '{self.input_format}' not available" 1294 ) 1295 1296 # SQLite connexion 1297 elif connexion_format in ["sqlite"] and input_format in [ 1298 "vcf", 1299 "tsv", 1300 "csv", 1301 "psv", 1302 ]: 1303 1304 # Main structure 1305 structure = { 1306 "#CHROM": "VARCHAR", 1307 "POS": "INTEGER", 1308 "ID": "VARCHAR", 1309 "REF": "VARCHAR", 1310 "ALT": "VARCHAR", 1311 "QUAL": "VARCHAR", 1312 "FILTER": "VARCHAR", 1313 "INFO": "VARCHAR", 1314 } 1315 1316 # Strcuture with samples 1317 structure_complete = structure 1318 if self.get_header_sample_list(): 1319 structure["FORMAT"] = "VARCHAR" 1320 for sample in self.get_header_sample_list(): 1321 structure_complete[sample] = "VARCHAR" 1322 1323 # Columns list for create and insert 1324 sql_create_table_columns = [] 1325 sql_create_table_columns_list = [] 1326 for column in structure_complete: 1327 column_type = structure_complete[column] 1328 sql_create_table_columns.append( 1329 f'"{column}" {column_type} default NULL' 1330 ) 1331 sql_create_table_columns_list.append(f'"{column}"') 1332 1333 # Create database 1334 log.debug(f"Create Table {table_variants}") 1335 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1336 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1337 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1338 self.conn.execute(sql_create_table) 1339 1340 # chunksize define length of file chunk load file 1341 chunksize = 100000 1342 1343 # delimiter 1344 delimiter 
= file_format_delimiters.get(input_format, "\t") 1345 1346 # Load the input file 1347 with open(self.input, "rt") as input_file: 1348 1349 # Use the appropriate file handler based on the input format 1350 if input_compressed: 1351 input_file = bgzf.open(self.input, "rt") 1352 if input_format in ["vcf"]: 1353 header_len = self.get_header_length() 1354 else: 1355 header_len = 0 1356 1357 # Insert the file contents into a table 1358 self.insert_file_to_table( 1359 input_file, 1360 columns=sql_create_table_columns_list_sql, 1361 header_len=header_len, 1362 sep=delimiter, 1363 chunksize=chunksize, 1364 ) 1365 1366 else: 1367 log.error( 1368 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1369 ) 1370 raise ValueError( 1371 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1372 ) 1373 1374 # Explode INFOS fields into table fields 1375 if self.get_explode_infos(): 1376 self.explode_infos( 1377 prefix=self.get_explode_infos_prefix(), 1378 fields=self.get_explode_infos_fields(), 1379 force=True, 1380 ) 1381 1382 # Create index after insertion 1383 self.create_indexes() 1384 1385 def get_explode_infos(self) -> bool: 1386 """ 1387 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1388 to False if it is not set. 1389 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1390 value. If the parameter is not present, it will return False. 1391 """ 1392 1393 return self.get_param().get("explode", {}).get("explode_infos", False) 1394 1395 def get_explode_infos_fields( 1396 self, 1397 explode_infos_fields: str = None, 1398 remove_fields_not_in_header: bool = False, 1399 ) -> list: 1400 """ 1401 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1402 the input parameter `explode_infos_fields`. 
1403 1404 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1405 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1406 comma-separated list of field names to explode 1407 :type explode_infos_fields: str 1408 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1409 flag that determines whether to remove fields that are not present in the header. If it is set 1410 to `True`, any field that is not in the header will be excluded from the list of exploded 1411 information fields. If it is set to `, defaults to False 1412 :type remove_fields_not_in_header: bool (optional) 1413 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1414 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1415 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1416 Otherwise, it returns a list of exploded information fields after removing any spaces and 1417 splitting the string by commas. 
1418 """ 1419 1420 # If no fields, get it in param 1421 if not explode_infos_fields: 1422 explode_infos_fields = ( 1423 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1424 ) 1425 1426 # If no fields, defined as all fields in header using keyword 1427 if not explode_infos_fields: 1428 explode_infos_fields = "*" 1429 1430 # If fields list not empty 1431 if explode_infos_fields: 1432 1433 # Input fields list 1434 if isinstance(explode_infos_fields, str): 1435 fields_input = explode_infos_fields.split(",") 1436 elif isinstance(explode_infos_fields, list): 1437 fields_input = explode_infos_fields 1438 else: 1439 fields_input = [] 1440 1441 # Fields list without * keyword 1442 fields_without_all = fields_input.copy() 1443 if "*".casefold() in (item.casefold() for item in fields_without_all): 1444 fields_without_all.remove("*") 1445 1446 # Fields in header 1447 fields_in_header = sorted(list(set(self.get_header().infos))) 1448 1449 # Construct list of fields 1450 fields_output = [] 1451 for field in fields_input: 1452 1453 # Strip field 1454 field = field.strip() 1455 1456 # format keyword * in regex 1457 if field.upper() in ["*"]: 1458 field = ".*" 1459 1460 # Find all fields with pattern 1461 r = re.compile(field) 1462 fields_search = sorted(list(filter(r.match, fields_in_header))) 1463 1464 # Remove fields input from search 1465 if field in fields_search: 1466 fields_search = [field] 1467 elif fields_search != [field]: 1468 fields_search = sorted( 1469 list(set(fields_search).difference(fields_input)) 1470 ) 1471 1472 # If field is not in header (avoid not well formatted header) 1473 if not fields_search and not remove_fields_not_in_header: 1474 fields_search = [field] 1475 1476 # Add found fields 1477 for new_field in fields_search: 1478 # Add field, if not already exists, and if it is in header (if asked) 1479 if ( 1480 new_field not in fields_output 1481 and ( 1482 not remove_fields_not_in_header 1483 or new_field in fields_in_header 1484 ) 
1485 and new_field not in [".*"] 1486 ): 1487 fields_output.append(new_field) 1488 1489 return fields_output 1490 1491 else: 1492 1493 return [] 1494 1495 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1496 """ 1497 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1498 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1499 not provided. 1500 1501 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1502 prefix to be used for exploding or expanding information 1503 :type explode_infos_prefix: str 1504 :return: the value of the variable `explode_infos_prefix`. 1505 """ 1506 1507 if not explode_infos_prefix: 1508 explode_infos_prefix = ( 1509 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1510 ) 1511 1512 return explode_infos_prefix 1513 1514 def add_column( 1515 self, 1516 table_name, 1517 column_name, 1518 column_type, 1519 default_value=None, 1520 drop: bool = False, 1521 ) -> dict: 1522 """ 1523 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1524 doesn't already exist. 1525 1526 :param table_name: The name of the table to which you want to add a column 1527 :param column_name: The parameter "column_name" is the name of the column that you want to add 1528 to the table 1529 :param column_type: The `column_type` parameter specifies the data type of the column that you 1530 want to add to the table. It should be a string that represents the desired data type, such as 1531 "INTEGER", "TEXT", "REAL", etc 1532 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1533 default value for the newly added column. 
If a default value is provided, it will be assigned to 1534 the column for any existing rows that do not have a value for that column 1535 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1536 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1537 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1538 to False 1539 :type drop: bool (optional) 1540 :return: a boolean value indicating whether the column was successfully added to the table. 1541 """ 1542 1543 # added 1544 added = False 1545 dropped = False 1546 1547 # Check if the column already exists in the table 1548 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1549 columns = self.get_query_to_df(query).columns.tolist() 1550 if column_name.upper() in [c.upper() for c in columns]: 1551 log.debug( 1552 f"The {column_name} column already exists in the {table_name} table" 1553 ) 1554 if drop: 1555 self.drop_column(table_name=table_name, column_name=column_name) 1556 dropped = True 1557 else: 1558 return None 1559 else: 1560 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1561 1562 # Add column in table 1563 add_column_query = ( 1564 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1565 ) 1566 if default_value is not None: 1567 add_column_query += f" DEFAULT {default_value}" 1568 self.execute_query(add_column_query) 1569 added = not dropped 1570 log.debug( 1571 f"The {column_name} column was successfully added to the {table_name} table" 1572 ) 1573 1574 if added: 1575 added_column = { 1576 "table_name": table_name, 1577 "column_name": column_name, 1578 "column_type": column_type, 1579 "default_value": default_value, 1580 } 1581 else: 1582 added_column = None 1583 1584 return added_column 1585 1586 def drop_column( 1587 self, column: dict = None, table_name: str = None, column_name: str = None 1588 ) -> bool: 1589 """ 1590 The 
`drop_column` function drops a specified column from a given table in a database and returns 1591 True if the column was successfully dropped, and False if the column does not exist in the 1592 table. 1593 1594 :param column: The `column` parameter is a dictionary that contains information about the column 1595 you want to drop. It has two keys: 1596 :type column: dict 1597 :param table_name: The `table_name` parameter is the name of the table from which you want to 1598 drop a column 1599 :type table_name: str 1600 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1601 from the table 1602 :type column_name: str 1603 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1604 and False if the column does not exist in the table. 1605 """ 1606 1607 # Find column infos 1608 if column: 1609 if isinstance(column, dict): 1610 table_name = column.get("table_name", None) 1611 column_name = column.get("column_name", None) 1612 elif isinstance(column, str): 1613 table_name = self.get_table_variants() 1614 column_name = column 1615 else: 1616 table_name = None 1617 column_name = None 1618 1619 if not table_name and not column_name: 1620 return False 1621 1622 # Removed 1623 removed = False 1624 1625 # Check if the column already exists in the table 1626 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1627 columns = self.get_query_to_df(query).columns.tolist() 1628 if column_name in columns: 1629 log.debug(f"The {column_name} column exists in the {table_name} table") 1630 else: 1631 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1632 return False 1633 1634 # Add column in table # ALTER TABLE integers DROP k 1635 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1636 self.execute_query(add_column_query) 1637 removed = True 1638 log.debug( 1639 f"The {column_name} column was successfully dropped to the {table_name} table" 1640 ) 1641 
        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix (or "INFO/" as last resort)
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table, defaults to
        False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together (one UPDATE) or
        individually (one UPDATE per field). The default value is False, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If not
        provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes (they would be invalidated by the ALTER/UPDATE below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # No column can be added in read-only mode
        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best effort: an empty list if they cannot be fetched)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name = prefix + INFO field name
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Fields absent from the header default to String type
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (num != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract the '<info>=' value from
                        # the raw INFO column.
                        # duckdb: regex on ';'-prefixed INFO; '' and '.' map to NULL.
                        # sqlite: substr/instr parsing.
                        # NOTE(review): the sqlite branch does not map '' or '.'
                        # to NULL like the duckdb one — confirm whether this
                        # divergence is intended.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (best effort: a single pass if the query fails)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (per-chromosome only when there are several)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one UPDATE for all fields, or one per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion: a composite
        (#CHROM, POS, REF, ALT) index, one index per core column, and one per
        additional exploded field. No-op in read-only mode or when indexing is
        disabled.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes of the variants table (duckdb or sqlite). No-op in
        read-only mode.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # List existing indexes through the engine's catalog
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines

        :param f: the file object
        :return: The header lines of the VCF file, up to and including the #CHROM line.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            # The #CHROM columns line is the last header line
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
        uncompressed files.

        :param file: The `file` parameter is a string that represents the path to the VCF header file
        that you want to read. It is an optional parameter, so if you don't provide a value, it will
        default to `None`
        :type file: str
        :return: The function `read_vcf_header_file` returns a list.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
1954 """ 1955 if query: 1956 return self.conn.execute(query) # .fetchall() 1957 else: 1958 return None 1959 1960 def export_output( 1961 self, 1962 output_file: str | None = None, 1963 output_header: str | None = None, 1964 export_header: bool = True, 1965 query: str | None = None, 1966 parquet_partitions: list | None = None, 1967 chunk_size: int | None = None, 1968 threads: int | None = None, 1969 sort: bool = False, 1970 index: bool = False, 1971 order_by: str | None = None, 1972 ) -> bool: 1973 """ 1974 The `export_output` function exports data from a VCF file to a specified output file in various 1975 formats, including VCF, CSV, TSV, PSV, and Parquet. 1976 1977 :param output_file: The `output_file` parameter is a string that specifies the name of the 1978 output file to be generated by the function. This is where the exported data will be saved 1979 :type output_file: str 1980 :param output_header: The `output_header` parameter is a string that specifies the name of the 1981 file where the header of the VCF file will be exported. If this parameter is not provided, the 1982 header will be exported to a file with the same name as the `output_file` parameter, but with 1983 the extension " 1984 :type output_header: str 1985 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1986 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1987 True, the header will be exported to a file. If `export_header` is False, the header will not 1988 be, defaults to True, if output format is not VCF 1989 :type export_header: bool (optional) 1990 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1991 select specific data from the VCF file before exporting it. 
If provided, only the data that 1992 matches the query will be exported 1993 :type query: str 1994 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1995 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1996 organize data in a hierarchical directory structure based on the values of one or more columns. 1997 This can improve query performance when working with large datasets 1998 :type parquet_partitions: list 1999 :param chunk_size: The `chunk_size` parameter specifies the number of 2000 records in batch when exporting data in Parquet format. This parameter is used for 2001 partitioning the Parquet file into multiple files. 2002 :type chunk_size: int 2003 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2004 threads to be used during the export process. It determines the level of parallelism and can 2005 improve the performance of the export operation. If not provided, the function will use the 2006 default number of threads 2007 :type threads: int 2008 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2009 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2010 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2011 False 2012 :type sort: bool (optional) 2013 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2014 created on the output file. If `index` is True, an index will be created. If `index` is False, 2015 no index will be created. The default value is False, defaults to False 2016 :type index: bool (optional) 2017 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2018 sorting the output file. This parameter is only applicable when exporting data in VCF format 2019 :type order_by: str 2020 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2021 None if it doesn't. 2022 """ 2023 2024 # Log 2025 log.info("Exporting...") 2026 2027 # Full path 2028 output_file = full_path(output_file) 2029 output_header = full_path(output_header) 2030 2031 # Config 2032 config = self.get_config() 2033 2034 # Param 2035 param = self.get_param() 2036 2037 # Tmp files to remove 2038 tmp_to_remove = [] 2039 2040 # If no output, get it 2041 if not output_file: 2042 output_file = self.get_output() 2043 2044 # If not threads 2045 if not threads: 2046 threads = self.get_threads() 2047 2048 # Auto header name with extension 2049 if export_header or output_header: 2050 if not output_header: 2051 output_header = f"{output_file}.hdr" 2052 # Export header 2053 self.export_header(output_file=output_file) 2054 2055 # Switch off export header if VCF output 2056 output_file_type = get_file_format(output_file) 2057 if output_file_type in ["vcf"]: 2058 export_header = False 2059 tmp_to_remove.append(output_header) 2060 2061 # Chunk size 2062 if not chunk_size: 2063 chunk_size = config.get("chunk_size", None) 2064 2065 # Parquet partition 2066 if not parquet_partitions: 2067 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2068 if parquet_partitions and isinstance(parquet_partitions, str): 2069 parquet_partitions = parquet_partitions.split(",") 2070 2071 # Order by 2072 if not order_by: 2073 order_by = param.get("export", {}).get("order_by", "") 2074 2075 # Header in output 2076 header_in_output = param.get("export", {}).get("include_header", False) 2077 2078 # Database 2079 database_source = self.get_connexion() 2080 2081 # Connexion format 2082 connexion_format = self.get_connexion_format() 2083 2084 # Explode infos 2085 if self.get_explode_infos(): 2086 self.explode_infos( 2087 prefix=self.get_explode_infos_prefix(), 2088 fields=self.get_explode_infos_fields(), 2089 force=False, 2090 ) 2091 2092 # if connexion_format in ["sqlite"] or query: 
2093 if connexion_format in ["sqlite"]: 2094 2095 # Export in Parquet 2096 random_tmp = "".join( 2097 random.choice(string.ascii_lowercase) for i in range(10) 2098 ) 2099 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2100 tmp_to_remove.append(database_source) 2101 2102 # Table Variants 2103 table_variants = self.get_table_variants() 2104 2105 # Create export query 2106 sql_query_export_subquery = f""" 2107 SELECT * FROM {table_variants} 2108 """ 2109 2110 # Write source file 2111 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2112 2113 # Create database 2114 database = Database( 2115 database=database_source, 2116 table="variants", 2117 header_file=output_header, 2118 conn_config=self.get_connexion_config(), 2119 ) 2120 2121 # Existing colomns header 2122 # existing_columns_header = database.get_header_file_columns(output_header) 2123 existing_columns_header = database.get_header_columns_from_database() 2124 2125 # Export file 2126 database.export( 2127 output_database=output_file, 2128 output_header=output_header, 2129 existing_columns_header=existing_columns_header, 2130 parquet_partitions=parquet_partitions, 2131 chunk_size=chunk_size, 2132 threads=threads, 2133 sort=sort, 2134 index=index, 2135 header_in_output=header_in_output, 2136 order_by=order_by, 2137 query=query, 2138 export_header=export_header, 2139 ) 2140 2141 # Remove 2142 remove_if_exists(tmp_to_remove) 2143 2144 return (os.path.exists(output_file) or None) and ( 2145 os.path.exists(output_file) or None 2146 ) 2147 2148 def get_extra_infos(self, table: str = None) -> list: 2149 """ 2150 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2151 in the header. 2152 2153 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2154 name of the table from which you want to retrieve the extra columns that are not present in the 2155 header. 
If the `table` parameter is not provided when calling the function, it will default to 2156 using the variants 2157 :type table: str 2158 :return: A list of columns that are in the specified table but not in the header of the table. 2159 """ 2160 2161 header_columns = [] 2162 2163 if not table: 2164 table = self.get_table_variants(clause="from") 2165 header_columns = self.get_header_columns() 2166 2167 # Check all columns in the database 2168 query = f""" SELECT * FROM {table} LIMIT 1 """ 2169 log.debug(f"query {query}") 2170 table_columns = self.get_query_to_df(query).columns.tolist() 2171 extra_columns = [] 2172 2173 # Construct extra infos (not in header) 2174 for column in table_columns: 2175 if column not in header_columns: 2176 extra_columns.append(column) 2177 2178 return extra_columns 2179 2180 def get_extra_infos_sql(self, table: str = None) -> str: 2181 """ 2182 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2183 by double quotes 2184 2185 :param table: The name of the table to get the extra infos from. If None, the default table is 2186 used 2187 :type table: str 2188 :return: A string of the extra infos 2189 """ 2190 2191 return ", ".join( 2192 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2193 ) 2194 2195 def export_header( 2196 self, 2197 header_name: str = None, 2198 output_file: str = None, 2199 output_file_ext: str = ".hdr", 2200 clean_header: bool = True, 2201 remove_chrom_line: bool = False, 2202 ) -> str: 2203 """ 2204 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2205 specified options, and writes it to a new file. 2206 2207 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2208 this parameter is not specified, the header will be written to the output file 2209 :type header_name: str 2210 :param output_file: The `output_file` parameter in the `export_header` function is used to 2211 specify the name of the output file where the header will be written. If this parameter is not 2212 provided, the header will be written to a temporary file 2213 :type output_file: str 2214 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2215 string that represents the extension of the output header file. By default, it is set to ".hdr" 2216 if not specified by the user. This extension will be appended to the `output_file` name to 2217 create the final, defaults to .hdr 2218 :type output_file_ext: str (optional) 2219 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2220 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2221 `True`, the function will clean the header by modifying certain lines based on a specific 2222 pattern. If `clean_header`, defaults to True 2223 :type clean_header: bool (optional) 2224 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2225 boolean flag that determines whether the #CHROM line should be removed from the header before 2226 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2227 defaults to False 2228 :type remove_chrom_line: bool (optional) 2229 :return: The function `export_header` returns the name of the temporary header file that is 2230 created. 
2231 """ 2232 2233 if not header_name and not output_file: 2234 output_file = self.get_output() 2235 2236 if self.get_header(): 2237 2238 # Get header object 2239 header_obj = self.get_header() 2240 2241 # Create database 2242 db_for_header = Database(database=self.get_input()) 2243 2244 # Get real columns in the file 2245 db_header_columns = db_for_header.get_columns() 2246 2247 with tempfile.TemporaryDirectory() as tmpdir: 2248 2249 # Write header file 2250 header_file_tmp = os.path.join(tmpdir, "header") 2251 f = open(header_file_tmp, "w") 2252 vcf.Writer(f, header_obj) 2253 f.close() 2254 2255 # Replace #CHROM line with rel columns 2256 header_list = db_for_header.read_header_file( 2257 header_file=header_file_tmp 2258 ) 2259 header_list[-1] = "\t".join(db_header_columns) 2260 2261 # Remove CHROM line 2262 if remove_chrom_line: 2263 header_list.pop() 2264 2265 # Clean header 2266 if clean_header: 2267 header_list_clean = [] 2268 for head in header_list: 2269 # Clean head for malformed header 2270 head_clean = head 2271 head_clean = re.subn( 2272 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2273 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2274 head_clean, 2275 2, 2276 )[0] 2277 # Write header 2278 header_list_clean.append(head_clean) 2279 header_list = header_list_clean 2280 2281 tmp_header_name = output_file + output_file_ext 2282 2283 f = open(tmp_header_name, "w") 2284 for line in header_list: 2285 f.write(line) 2286 f.close() 2287 2288 return tmp_header_name 2289 2290 def export_variant_vcf( 2291 self, 2292 vcf_file, 2293 remove_info: bool = False, 2294 add_samples: bool = True, 2295 list_samples: list = [], 2296 where_clause: str = "", 2297 index: bool = False, 2298 threads: int | None = None, 2299 ) -> bool | None: 2300 """ 2301 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2302 remove INFO field, add samples, and control compression and indexing. 
2303 2304 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2305 written to. It is the output file that will contain the filtered VCF data based on the specified 2306 parameters 2307 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2308 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2309 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2310 in, defaults to False 2311 :type remove_info: bool (optional) 2312 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2313 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2314 If set to False, the samples will be removed. The default value is True, defaults to True 2315 :type add_samples: bool (optional) 2316 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2317 in the output VCF file. By default, all samples will be included. If you provide a list of 2318 samples, only those samples will be included in the output file 2319 :type list_samples: list 2320 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2321 determines whether or not to create an index for the output VCF file. If `index` is set to 2322 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2323 :type index: bool (optional) 2324 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2325 number of threads to use for exporting the VCF file. It determines how many parallel threads 2326 will be used during the export process. More threads can potentially speed up the export process 2327 by utilizing multiple cores of the processor. 
If 2328 :type threads: int | None 2329 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2330 method with various parameters including the output file, query, threads, sort flag, and index 2331 flag. The `export_output` method is responsible for exporting the VCF data based on the 2332 specified parameters and configurations provided in the `export_variant_vcf` function. 2333 """ 2334 2335 # Config 2336 config = self.get_config() 2337 2338 # Extract VCF 2339 log.debug("Export VCF...") 2340 2341 # Table variants 2342 table_variants = self.get_table_variants() 2343 2344 # Threads 2345 if not threads: 2346 threads = self.get_threads() 2347 2348 # Info fields 2349 if remove_info: 2350 if not isinstance(remove_info, str): 2351 remove_info = "." 2352 info_field = f"""'{remove_info}' as INFO""" 2353 else: 2354 info_field = "INFO" 2355 2356 # Samples fields 2357 if add_samples: 2358 if not list_samples: 2359 list_samples = self.get_header_sample_list() 2360 if list_samples: 2361 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2362 else: 2363 samples_fields = "" 2364 log.debug(f"samples_fields: {samples_fields}") 2365 else: 2366 samples_fields = "" 2367 2368 # Where clause 2369 if where_clause is None: 2370 where_clause = "" 2371 2372 # Variants 2373 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2374 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2375 log.debug(f"sql_query_select={sql_query_select}") 2376 2377 return self.export_output( 2378 output_file=vcf_file, 2379 output_header=None, 2380 export_header=True, 2381 query=sql_query_select, 2382 parquet_partitions=None, 2383 chunk_size=config.get("chunk_size", None), 2384 threads=threads, 2385 sort=True, 2386 index=index, 2387 order_by=None, 2388 ) 2389 2390 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2391 """ 2392 It takes a list of commands and runs 
them in parallel using the number of threads specified 2393 2394 :param commands: A list of commands to run 2395 :param threads: The number of threads to use, defaults to 1 (optional) 2396 """ 2397 2398 run_parallel_commands(commands, threads) 2399 2400 def get_threads(self, default: int = 1) -> int: 2401 """ 2402 This function returns the number of threads to use for a job, with a default value of 1 if not 2403 specified. 2404 2405 :param default: The `default` parameter in the `get_threads` method is used to specify the 2406 default number of threads to use if no specific value is provided. If no value is provided for 2407 the `threads` parameter in the configuration or input parameters, the `default` value will be 2408 used, defaults to 1 2409 :type default: int (optional) 2410 :return: the number of threads to use for the current job. 2411 """ 2412 2413 # Config 2414 config = self.get_config() 2415 2416 # Param 2417 param = self.get_param() 2418 2419 # Input threads 2420 input_thread = param.get("threads", config.get("threads", None)) 2421 2422 # Check threads 2423 if not input_thread: 2424 threads = default 2425 elif int(input_thread) <= 0: 2426 threads = os.cpu_count() 2427 else: 2428 threads = int(input_thread) 2429 return threads 2430 2431 def get_memory(self, default: str = None) -> str: 2432 """ 2433 This function retrieves the memory value from parameters or configuration with a default value 2434 if not found. 2435 2436 :param default: The `get_memory` function takes in a default value as a string parameter. This 2437 default value is used as a fallback in case the `memory` parameter is not provided in the 2438 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2439 the function 2440 :type default: str 2441 :return: The `get_memory` function returns a string value representing the memory parameter. If 2442 the `input_memory` is provided in the parameters, it will return that value. 
    def get_memory(self, default: str = None) -> str:
        """
        Return the configured memory setting (e.g. "8G").

        The value is looked up in the parameters first, then in the
        configuration; when neither defines "memory", `default` is returned.

        :param default: fallback value when "memory" is not configured
        :type default: str
        :return: the memory setting, or `default` when not configured
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back to the provided default when not configured
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table from a VCF file, dispatching on the
        connexion format: duckdb uses the DataFrame-based method, sqlite the
        temporary-table method. Other formats are silently ignored.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        the given VCF file, matching rows on #CHROM/POS/REF/ALT; non-empty
        INFO values are joined with ';'.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping the meta-header lines so the
        # '#CHROM' line becomes the column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE(review): 'vcf_df' is referenced by name in the SQL below —
        # presumably resolved by DuckDB's replacement scan over local
        # DataFrames; confirm the connexion is a DuckDB connexion here.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file through a
        temporary SQLite table: the VCF is bulk-loaded into a temporary table,
        INFO values are merged (rows joined on #CHROM/POS/REF/ALT, non-empty
        values joined with ';'), then the temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same columns as 'variants'
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table ('#' meta-header lines skipped)
        # NOTE(review): assumes the VCF has exactly 8 columns (no FORMAT or
        # sample columns) — confirm against the callers.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                       WHEN INFO NOT IN ('', '.')
                       THEN INFO
                       ELSE ''
                   END ||
                   (
                   SELECT
                       CASE
                           WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                           THEN ';'
                           ELSE ''
                       END ||
                       CASE
                           WHEN table_vcf.INFO NOT IN ('','.')
                           THEN table_vcf.INFO
                           ELSE ''
                       END
                   FROM {table_vcf} as table_vcf
                   WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                       AND table_vcf.\"POS\" = table_variants.\"POS\"
                       AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                       AND table_vcf.\"REF\" = table_variants.\"REF\"
                   )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant identifier column to the variants table, populated with
        a hash of the assembly and the `#CHROM`, `POS`, `REF`, `ALT` columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id", not
        # `variant_id_column` — confirm whether custom column names should be
        # checked here instead.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is a quoted SQL string literal
            # (the column *name*), so the hash mixes in a constant string, not
            # the SVTYPE column value — confirm this is intended.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
'"{prefix}SVTYPE"') 2649 """ 2650 ) 2651 2652 # Remove added columns 2653 for added_column in added_columns: 2654 self.drop_column(column=added_column) 2655 2656 # return variant_id column name 2657 return variant_id_column 2658 2659 def get_variant_id_column( 2660 self, variant_id_column: str = "variant_id", force: bool = None 2661 ) -> str: 2662 """ 2663 This function returns the variant_id column name 2664 2665 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2666 defaults to variant_id 2667 :type variant_id_column: str (optional) 2668 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2669 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2670 if it is not already set, or if it is set 2671 :type force: bool 2672 :return: The variant_id column name. 2673 """ 2674 2675 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2676 2677 ### 2678 # Annotation 2679 ### 2680 2681 def scan_databases( 2682 self, 2683 database_formats: list = ["parquet"], 2684 database_releases: list = ["current"], 2685 ) -> dict: 2686 """ 2687 The function `scan_databases` scans for available databases based on specified formats and 2688 releases. 2689 2690 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2691 of the databases to be scanned. In this case, the accepted format is "parquet" 2692 :type database_formats: list ["parquet"] 2693 :param database_releases: The `database_releases` parameter is a list that specifies the 2694 releases of the databases to be scanned. 
In the provided function, the default value for 2695 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2696 databases that are in the "current" 2697 :type database_releases: list 2698 :return: The function `scan_databases` returns a dictionary containing information about 2699 databases that match the specified formats and releases. 2700 """ 2701 2702 # Config 2703 config = self.get_config() 2704 2705 # Param 2706 param = self.get_param() 2707 2708 # Param - Assembly 2709 assembly = param.get("assembly", config.get("assembly", None)) 2710 if not assembly: 2711 assembly = DEFAULT_ASSEMBLY 2712 log.warning(f"Default assembly '{assembly}'") 2713 2714 # Scan for availabled databases 2715 log.info( 2716 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2717 ) 2718 databases_infos_dict = databases_infos( 2719 database_folder_releases=database_releases, 2720 database_formats=database_formats, 2721 assembly=assembly, 2722 config=config, 2723 ) 2724 log.info( 2725 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2726 ) 2727 2728 return databases_infos_dict 2729 2730 def annotation(self) -> None: 2731 """ 2732 It annotates the VCF file with the annotations specified in the config file. 
    def annotation(self) -> None:
        """
        Annotate the VCF with the annotations specified in the parameters.

        Builds a normalized "annotations" parameter string from the quick
        per-tool parameters (annotation_parquet, annotation_snpsift,
        annotation_snpeff, annotation_bcftools, annotation_annovar,
        annotation_exomiser, annotation_splice), resolves each annotation
        source (tool prefix, database file lookup inside the configured
        database folders), fills `param["annotation"]` accordingly, then runs
        each configured annotation tool in turn. Finally, INFO fields are
        exploded into table columns when requested.

        Note: this method mutates the `param` dictionary in place and stores
        it back with `set_param`.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param, then config, then default with a warning)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (deduplicated union of the configured
        # annotations, parquet and bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form only)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tool's quick parameter is folded into the annotations list with
        # its tool prefix; list values are joined, ',' is mapped to '+'
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: a string is split on ',' into
            # {file: {"INFO": None}}, otherwise it is used as a dict as-is
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: scan the database folders and
                # add every database found
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases), given as
                    # ALL:format=a+b:release=x+y
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after 'snpeff:' is taken
                    # as snpEff options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: 'annovar:db1:db2' registers each db
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: options parsed from the string
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: options parsed from the string
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (database file based)
                    else:

                        # Tools detection: explicit 'bcftools:'/'snpsift:'
                        # prefix, otherwise decided per file format below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' are both separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    # Without an assembly, fall back to the
                                    # database folders themselves
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hard-coded False, so the
                                    # bcftools branch below is currently dead
                                    # code — confirm whether this should be
                                    # configurable
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice
                                        # (harmless for membership tests)
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Run each configured annotation tool in a fixed order
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
.get("databases", {}) 3157 .get("bcftools", ["."]) 3158 ) 3159 log.debug("Databases annotations: " + str(databases_folders)) 3160 3161 # Param 3162 annotations = ( 3163 self.get_param() 3164 .get("annotation", {}) 3165 .get("snpsift", {}) 3166 .get("annotations", None) 3167 ) 3168 log.debug("Annotations: " + str(annotations)) 3169 3170 # Assembly 3171 assembly = self.get_param().get( 3172 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3173 ) 3174 3175 # Data 3176 table_variants = self.get_table_variants() 3177 3178 # Check if not empty 3179 log.debug("Check if not empty") 3180 sql_query_chromosomes = ( 3181 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3182 ) 3183 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3184 if not sql_query_chromosomes_df["count"][0]: 3185 log.info(f"VCF empty") 3186 return 3187 3188 # VCF header 3189 vcf_reader = self.get_header() 3190 log.debug("Initial header: " + str(vcf_reader.infos)) 3191 3192 # Existing annotations 3193 for vcf_annotation in self.get_header().infos: 3194 3195 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3196 log.debug( 3197 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3198 ) 3199 3200 if annotations: 3201 3202 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3203 3204 # Export VCF file 3205 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3206 3207 # Init 3208 commands = {} 3209 3210 for annotation in annotations: 3211 annotation_fields = annotations[annotation] 3212 3213 # Annotation Name 3214 annotation_name = os.path.basename(annotation) 3215 3216 if not annotation_fields: 3217 annotation_fields = {"INFO": None} 3218 3219 log.debug(f"Annotation '{annotation_name}'") 3220 log.debug( 3221 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3222 ) 3223 3224 # Create Database 3225 database = Database( 3226 database=annotation, 3227 databases_folders=databases_folders, 3228 
assembly=assembly, 3229 ) 3230 3231 # Find files 3232 db_file = database.get_database() 3233 db_file = full_path(db_file) 3234 db_hdr_file = database.get_header_file() 3235 db_hdr_file = full_path(db_hdr_file) 3236 db_file_type = database.get_format() 3237 db_tbi_file = f"{db_file}.tbi" 3238 db_file_compressed = database.is_compressed() 3239 3240 # Check if compressed 3241 if not db_file_compressed: 3242 log.error( 3243 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3244 ) 3245 raise ValueError( 3246 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3247 ) 3248 3249 # Check if indexed 3250 if not os.path.exists(db_tbi_file): 3251 log.error( 3252 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3253 ) 3254 raise ValueError( 3255 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3256 ) 3257 3258 # Check index - try to create if not exists 3259 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3260 log.error("Annotation failed: database not valid") 3261 log.error(f"Annotation annotation file: {db_file}") 3262 log.error(f"Annotation annotation header: {db_hdr_file}") 3263 log.error(f"Annotation annotation index: {db_tbi_file}") 3264 raise ValueError( 3265 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3266 ) 3267 else: 3268 3269 log.debug( 3270 f"Annotation '{annotation}' - file: " 3271 + str(db_file) 3272 + " and " 3273 + str(db_hdr_file) 3274 ) 3275 3276 # Load header as VCF object 3277 db_hdr_vcf = Variants(input=db_hdr_file) 3278 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3279 log.debug( 3280 "Annotation database header: " 3281 + str(db_hdr_vcf_header_infos) 3282 ) 3283 3284 # For all fields in database 3285 annotation_fields_full = False 3286 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3287 annotation_fields = { 3288 key: key for key in 
db_hdr_vcf_header_infos 3289 } 3290 log.debug( 3291 "Annotation database header - All annotations added: " 3292 + str(annotation_fields) 3293 ) 3294 annotation_fields_full = True 3295 3296 # # Create file for field rename 3297 # log.debug("Create file for field rename") 3298 # tmp_rename = NamedTemporaryFile( 3299 # prefix=self.get_prefix(), 3300 # dir=self.get_tmp_dir(), 3301 # suffix=".rename", 3302 # delete=False, 3303 # ) 3304 # tmp_rename_name = tmp_rename.name 3305 # tmp_files.append(tmp_rename_name) 3306 3307 # Number of fields 3308 nb_annotation_field = 0 3309 annotation_list = [] 3310 annotation_infos_rename_list = [] 3311 3312 for annotation_field in annotation_fields: 3313 3314 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3315 annotation_fields_new_name = annotation_fields.get( 3316 annotation_field, annotation_field 3317 ) 3318 if not annotation_fields_new_name: 3319 annotation_fields_new_name = annotation_field 3320 3321 # Check if field is in DB and if field is not elready in input data 3322 if ( 3323 annotation_field in db_hdr_vcf.get_header().infos 3324 and annotation_fields_new_name 3325 not in self.get_header().infos 3326 ): 3327 3328 log.info( 3329 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3330 ) 3331 3332 # BCFTools annotate param to rename fields 3333 if annotation_field != annotation_fields_new_name: 3334 annotation_infos_rename_list.append( 3335 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3336 ) 3337 3338 # Add INFO field to header 3339 db_hdr_vcf_header_infos_number = ( 3340 db_hdr_vcf_header_infos[annotation_field].num or "." 
3341 ) 3342 db_hdr_vcf_header_infos_type = ( 3343 db_hdr_vcf_header_infos[annotation_field].type 3344 or "String" 3345 ) 3346 db_hdr_vcf_header_infos_description = ( 3347 db_hdr_vcf_header_infos[annotation_field].desc 3348 or f"{annotation_field} description" 3349 ) 3350 db_hdr_vcf_header_infos_source = ( 3351 db_hdr_vcf_header_infos[annotation_field].source 3352 or "unknown" 3353 ) 3354 db_hdr_vcf_header_infos_version = ( 3355 db_hdr_vcf_header_infos[annotation_field].version 3356 or "unknown" 3357 ) 3358 3359 vcf_reader.infos[annotation_fields_new_name] = ( 3360 vcf.parser._Info( 3361 annotation_fields_new_name, 3362 db_hdr_vcf_header_infos_number, 3363 db_hdr_vcf_header_infos_type, 3364 db_hdr_vcf_header_infos_description, 3365 db_hdr_vcf_header_infos_source, 3366 db_hdr_vcf_header_infos_version, 3367 self.code_type_map[ 3368 db_hdr_vcf_header_infos_type 3369 ], 3370 ) 3371 ) 3372 3373 annotation_list.append(annotation_field) 3374 3375 nb_annotation_field += 1 3376 3377 else: 3378 3379 if ( 3380 annotation_field 3381 not in db_hdr_vcf.get_header().infos 3382 ): 3383 log.warning( 3384 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3385 ) 3386 if ( 3387 annotation_fields_new_name 3388 in self.get_header().infos 3389 ): 3390 log.warning( 3391 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3392 ) 3393 3394 log.info( 3395 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3396 ) 3397 3398 annotation_infos = ",".join(annotation_list) 3399 3400 if annotation_infos != "": 3401 3402 # Annotated VCF (and error file) 3403 tmp_annotation_vcf_name = os.path.join( 3404 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3405 ) 3406 tmp_annotation_vcf_name_err = ( 3407 tmp_annotation_vcf_name + ".err" 3408 ) 3409 3410 # Add fields to annotate 3411 if not annotation_fields_full: 3412 annotation_infos_option = f"-info {annotation_infos}" 3413 else: 
3414 annotation_infos_option = "" 3415 3416 # Info fields rename 3417 if annotation_infos_rename_list: 3418 annotation_infos_rename = " -c " + ",".join( 3419 annotation_infos_rename_list 3420 ) 3421 else: 3422 annotation_infos_rename = "" 3423 3424 # Annotate command 3425 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3426 3427 # Add command 3428 commands[command_annotate] = tmp_annotation_vcf_name 3429 3430 if commands: 3431 3432 # Export VCF file 3433 self.export_variant_vcf( 3434 vcf_file=tmp_vcf_name, 3435 remove_info=True, 3436 add_samples=False, 3437 index=True, 3438 ) 3439 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3440 3441 # Num command 3442 nb_command = 0 3443 3444 # Annotate 3445 for command_annotate in commands: 3446 nb_command += 1 3447 log.info( 3448 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3449 ) 3450 log.debug(f"command_annotate={command_annotate}") 3451 run_parallel_commands([command_annotate], threads) 3452 3453 # Debug 3454 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3455 3456 # Update variants 3457 log.info( 3458 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3459 ) 3460 self.update_from_vcf(commands[command_annotate]) 3461 3462 def annotation_bcftools(self, threads: int = None) -> None: 3463 """ 3464 This function annotate with bcftools 3465 3466 :param threads: Number of threads to use 3467 :return: the value of the variable "return_value". 
3468 """ 3469 3470 # DEBUG 3471 log.debug("Start annotation with bcftools databases") 3472 3473 # Threads 3474 if not threads: 3475 threads = self.get_threads() 3476 log.debug("Threads: " + str(threads)) 3477 3478 # Config 3479 config = self.get_config() 3480 log.debug("Config: " + str(config)) 3481 3482 # DEBUG 3483 delete_tmp = True 3484 if self.get_config().get("verbosity", "warning") in ["debug"]: 3485 delete_tmp = False 3486 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3487 3488 # Config - BCFTools bin command 3489 bcftools_bin_command = get_bin_command( 3490 bin="bcftools", 3491 tool="bcftools", 3492 bin_type="bin", 3493 config=config, 3494 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3495 ) 3496 if not bcftools_bin_command: 3497 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3498 log.error(msg_err) 3499 raise ValueError(msg_err) 3500 3501 # Config - BCFTools databases folders 3502 databases_folders = set( 3503 self.get_config() 3504 .get("folders", {}) 3505 .get("databases", {}) 3506 .get("annotations", ["."]) 3507 + self.get_config() 3508 .get("folders", {}) 3509 .get("databases", {}) 3510 .get("bcftools", ["."]) 3511 ) 3512 log.debug("Databases annotations: " + str(databases_folders)) 3513 3514 # Param 3515 annotations = ( 3516 self.get_param() 3517 .get("annotation", {}) 3518 .get("bcftools", {}) 3519 .get("annotations", None) 3520 ) 3521 log.debug("Annotations: " + str(annotations)) 3522 3523 # Assembly 3524 assembly = self.get_param().get( 3525 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3526 ) 3527 3528 # Data 3529 table_variants = self.get_table_variants() 3530 3531 # Check if not empty 3532 log.debug("Check if not empty") 3533 sql_query_chromosomes = ( 3534 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3535 ) 3536 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3537 if not sql_query_chromosomes_df["count"][0]: 3538 log.info(f"VCF empty") 
3539 return 3540 3541 # Export in VCF 3542 log.debug("Create initial file to annotate") 3543 tmp_vcf = NamedTemporaryFile( 3544 prefix=self.get_prefix(), 3545 dir=self.get_tmp_dir(), 3546 suffix=".vcf.gz", 3547 delete=False, 3548 ) 3549 tmp_vcf_name = tmp_vcf.name 3550 3551 # VCF header 3552 vcf_reader = self.get_header() 3553 log.debug("Initial header: " + str(vcf_reader.infos)) 3554 3555 # Existing annotations 3556 for vcf_annotation in self.get_header().infos: 3557 3558 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3559 log.debug( 3560 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3561 ) 3562 3563 if annotations: 3564 3565 tmp_ann_vcf_list = [] 3566 commands = [] 3567 tmp_files = [] 3568 err_files = [] 3569 3570 for annotation in annotations: 3571 annotation_fields = annotations[annotation] 3572 3573 # Annotation Name 3574 annotation_name = os.path.basename(annotation) 3575 3576 if not annotation_fields: 3577 annotation_fields = {"INFO": None} 3578 3579 log.debug(f"Annotation '{annotation_name}'") 3580 log.debug( 3581 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3582 ) 3583 3584 # Create Database 3585 database = Database( 3586 database=annotation, 3587 databases_folders=databases_folders, 3588 assembly=assembly, 3589 ) 3590 3591 # Find files 3592 db_file = database.get_database() 3593 db_file = full_path(db_file) 3594 db_hdr_file = database.get_header_file() 3595 db_hdr_file = full_path(db_hdr_file) 3596 db_file_type = database.get_format() 3597 db_tbi_file = f"{db_file}.tbi" 3598 db_file_compressed = database.is_compressed() 3599 3600 # Check if compressed 3601 if not db_file_compressed: 3602 log.error( 3603 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3604 ) 3605 raise ValueError( 3606 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3607 ) 3608 3609 # Check if indexed 3610 if not os.path.exists(db_tbi_file): 3611 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3612 raise ValueError( 3613 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3614 ) 3615 3616 # Check index - try to create if not exists 3617 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3618 log.error("Annotation failed: database not valid") 3619 log.error(f"Annotation annotation file: {db_file}") 3620 log.error(f"Annotation annotation header: {db_hdr_file}") 3621 log.error(f"Annotation annotation index: {db_tbi_file}") 3622 raise ValueError( 3623 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3624 ) 3625 else: 3626 3627 log.debug( 3628 f"Annotation '{annotation}' - file: " 3629 + str(db_file) 3630 + " and " 3631 + str(db_hdr_file) 3632 ) 3633 3634 # Load header as VCF object 3635 db_hdr_vcf = Variants(input=db_hdr_file) 3636 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3637 log.debug( 3638 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3639 ) 3640 3641 # For all fields in database 3642 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3643 annotation_fields = { 3644 key: key for key in db_hdr_vcf_header_infos 3645 } 3646 log.debug( 3647 "Annotation database header - All annotations added: " 3648 + str(annotation_fields) 3649 ) 3650 3651 # Number of fields 3652 nb_annotation_field = 0 3653 annotation_list = [] 3654 3655 for annotation_field in annotation_fields: 3656 3657 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3658 annotation_fields_new_name = annotation_fields.get( 3659 annotation_field, annotation_field 3660 ) 3661 if not annotation_fields_new_name: 3662 annotation_fields_new_name = annotation_field 3663 3664 # Check if field is in DB and if field is not elready in input data 3665 if ( 3666 annotation_field in db_hdr_vcf.get_header().infos 3667 and annotation_fields_new_name 3668 not in self.get_header().infos 3669 ): 3670 3671 log.info( 3672 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3673 ) 3674 3675 # Add INFO field to header 3676 db_hdr_vcf_header_infos_number = ( 3677 db_hdr_vcf_header_infos[annotation_field].num or "." 3678 ) 3679 db_hdr_vcf_header_infos_type = ( 3680 db_hdr_vcf_header_infos[annotation_field].type 3681 or "String" 3682 ) 3683 db_hdr_vcf_header_infos_description = ( 3684 db_hdr_vcf_header_infos[annotation_field].desc 3685 or f"{annotation_field} description" 3686 ) 3687 db_hdr_vcf_header_infos_source = ( 3688 db_hdr_vcf_header_infos[annotation_field].source 3689 or "unknown" 3690 ) 3691 db_hdr_vcf_header_infos_version = ( 3692 db_hdr_vcf_header_infos[annotation_field].version 3693 or "unknown" 3694 ) 3695 3696 vcf_reader.infos[annotation_fields_new_name] = ( 3697 vcf.parser._Info( 3698 annotation_fields_new_name, 3699 db_hdr_vcf_header_infos_number, 3700 db_hdr_vcf_header_infos_type, 3701 db_hdr_vcf_header_infos_description, 3702 db_hdr_vcf_header_infos_source, 3703 db_hdr_vcf_header_infos_version, 3704 self.code_type_map[db_hdr_vcf_header_infos_type], 3705 ) 3706 ) 3707 3708 # annotation_list.append(annotation_field) 3709 if annotation_field != annotation_fields_new_name: 3710 annotation_list.append( 3711 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3712 ) 3713 else: 3714 annotation_list.append(annotation_field) 3715 3716 nb_annotation_field += 1 3717 3718 else: 3719 3720 if annotation_field not in db_hdr_vcf.get_header().infos: 3721 log.warning( 3722 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3723 ) 3724 if annotation_fields_new_name in self.get_header().infos: 3725 log.warning( 3726 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3727 ) 3728 3729 log.info( 3730 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3731 ) 3732 3733 annotation_infos = ",".join(annotation_list) 3734 3735 if annotation_infos != "": 3736 3737 # Protect header for bcftools (remove "#CHROM" and variants line) 3738 log.debug("Protect Header file - remove #CHROM line if exists") 3739 tmp_header_vcf = NamedTemporaryFile( 3740 prefix=self.get_prefix(), 3741 dir=self.get_tmp_dir(), 3742 suffix=".hdr", 3743 delete=False, 3744 ) 3745 tmp_header_vcf_name = tmp_header_vcf.name 3746 tmp_files.append(tmp_header_vcf_name) 3747 # Command 3748 if db_hdr_file.endswith(".gz"): 3749 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3750 else: 3751 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3752 # Run 3753 run_parallel_commands([command_extract_header], 1) 3754 3755 # Find chomosomes 3756 log.debug("Find chromosomes ") 3757 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3758 sql_query_chromosomes_df = self.get_query_to_df( 3759 sql_query_chromosomes 3760 ) 3761 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3762 3763 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3764 3765 # BED columns in the annotation file 3766 if db_file_type in ["bed"]: 3767 annotation_infos = "CHROM,POS,POS," + annotation_infos 3768 3769 for chrom in chomosomes_list: 3770 3771 # Create BED on initial VCF 3772 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3773 tmp_bed = NamedTemporaryFile( 3774 prefix=self.get_prefix(), 3775 
dir=self.get_tmp_dir(), 3776 suffix=".bed", 3777 delete=False, 3778 ) 3779 tmp_bed_name = tmp_bed.name 3780 tmp_files.append(tmp_bed_name) 3781 3782 # Detecte regions 3783 log.debug( 3784 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3785 ) 3786 window = 1000000 3787 sql_query_intervals_for_bed = f""" 3788 SELECT \"#CHROM\", 3789 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3790 \"POS\"+{window} 3791 FROM {table_variants} as table_variants 3792 WHERE table_variants.\"#CHROM\" = '{chrom}' 3793 """ 3794 regions = self.conn.execute( 3795 sql_query_intervals_for_bed 3796 ).fetchall() 3797 merged_regions = merge_regions(regions) 3798 log.debug( 3799 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3800 ) 3801 3802 header = ["#CHROM", "START", "END"] 3803 with open(tmp_bed_name, "w") as f: 3804 # Write the header with tab delimiter 3805 f.write("\t".join(header) + "\n") 3806 for d in merged_regions: 3807 # Write each data row with tab delimiter 3808 f.write("\t".join(map(str, d)) + "\n") 3809 3810 # Tmp files 3811 tmp_annotation_vcf = NamedTemporaryFile( 3812 prefix=self.get_prefix(), 3813 dir=self.get_tmp_dir(), 3814 suffix=".vcf.gz", 3815 delete=False, 3816 ) 3817 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3818 tmp_files.append(tmp_annotation_vcf_name) 3819 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3820 tmp_annotation_vcf_name_err = ( 3821 tmp_annotation_vcf_name + ".err" 3822 ) 3823 err_files.append(tmp_annotation_vcf_name_err) 3824 3825 # Annotate Command 3826 log.debug( 3827 f"Annotation '{annotation}' - add bcftools command" 3828 ) 3829 3830 # Command 3831 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3832 3833 # Add command 3834 commands.append(command_annotate) 3835 3836 # if some commands 3837 if commands: 3838 3839 # Export VCF file 3840 self.export_variant_vcf( 3841 vcf_file=tmp_vcf_name, 3842 remove_info=True, 3843 add_samples=False, 3844 index=True, 3845 ) 3846 3847 # Threads 3848 # calculate threads for annotated commands 3849 if commands: 3850 threads_bcftools_annotate = round(threads / len(commands)) 3851 else: 3852 threads_bcftools_annotate = 1 3853 3854 if not threads_bcftools_annotate: 3855 threads_bcftools_annotate = 1 3856 3857 # Add threads option to bcftools commands 3858 if threads_bcftools_annotate > 1: 3859 commands_threaded = [] 3860 for command in commands: 3861 commands_threaded.append( 3862 command.replace( 3863 f"{bcftools_bin_command} annotate ", 3864 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3865 ) 3866 ) 3867 commands = commands_threaded 3868 3869 # Command annotation multithreading 3870 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3871 log.info( 3872 f"Annotation - Annotation multithreaded in " 3873 + str(len(commands)) 3874 + " commands" 3875 ) 3876 3877 run_parallel_commands(commands, threads) 3878 3879 # Merge 3880 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3881 3882 if tmp_ann_vcf_list_cmd: 3883 3884 # Tmp file 3885 tmp_annotate_vcf = NamedTemporaryFile( 3886 prefix=self.get_prefix(), 3887 dir=self.get_tmp_dir(), 3888 suffix=".vcf.gz", 3889 delete=True, 3890 ) 3891 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3892 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3893 err_files.append(tmp_annotate_vcf_name_err) 3894 3895 # Tmp file remove command 3896 tmp_files_remove_command = "" 3897 if tmp_files: 3898 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3899 3900 # Command merge 3901 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3902 log.info( 3903 f"Annotation - Annotation merging " 3904 + str(len(commands)) 3905 + " annotated files" 3906 ) 3907 log.debug(f"Annotation - merge command: {merge_command}") 3908 run_parallel_commands([merge_command], 1) 3909 3910 # Error messages 3911 log.info(f"Error/Warning messages:") 3912 error_message_command_all = [] 3913 error_message_command_warning = [] 3914 error_message_command_err = [] 3915 for err_file in err_files: 3916 with open(err_file, "r") as f: 3917 for line in f: 3918 message = line.strip() 3919 error_message_command_all.append(message) 3920 if line.startswith("[W::"): 3921 error_message_command_warning.append(message) 3922 if line.startswith("[E::"): 3923 error_message_command_err.append( 3924 f"{err_file}: " + message 3925 ) 3926 # log info 3927 for message in list( 3928 set(error_message_command_err + error_message_command_warning) 3929 ): 3930 log.info(f" {message}") 3931 # debug info 3932 for message in list(set(error_message_command_all)): 3933 log.debug(f" {message}") 3934 # failed 3935 if len(error_message_command_err): 3936 log.error("Annotation failed: Error in commands") 3937 raise ValueError("Annotation failed: Error in commands") 3938 3939 # Update variants 3940 log.info(f"Annotation - Updating...") 3941 self.update_from_vcf(tmp_annotate_vcf_name) 3942 3943 def annotation_exomiser(self, threads: int = None) -> None: 3944 """ 3945 This function annotate with Exomiser 3946 3947 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 3948 - "analysis" (dict/file): 3949 Full analysis dictionnary parameters (see Exomiser docs). 3950 Either a dict, or a file in JSON or YAML format. 3951 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 3952 Default : None 3953 - "preset" (string): 3954 Analysis preset (available in config folder). 
3955 Used if no full "analysis" is provided. 3956 Default: "exome" 3957 - "phenopacket" (dict/file): 3958 Samples and phenotipic features parameters (see Exomiser docs). 3959 Either a dict, or a file in JSON or YAML format. 3960 Default: None 3961 - "subject" (dict): 3962 Sample parameters (see Exomiser docs). 3963 Example: 3964 "subject": 3965 { 3966 "id": "ISDBM322017", 3967 "sex": "FEMALE" 3968 } 3969 Default: None 3970 - "sample" (string): 3971 Sample name to construct "subject" section: 3972 "subject": 3973 { 3974 "id": "<sample>", 3975 "sex": "UNKNOWN_SEX" 3976 } 3977 Default: None 3978 - "phenotypicFeatures" (dict) 3979 Phenotypic features to construct "subject" section. 3980 Example: 3981 "phenotypicFeatures": 3982 [ 3983 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3984 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3985 ] 3986 - "hpo" (list) 3987 List of HPO ids as phenotypic features. 3988 Example: 3989 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3990 Default: [] 3991 - "outputOptions" (dict): 3992 Output options (see Exomiser docs). 3993 Default: 3994 "output_options" = 3995 { 3996 "outputContributingVariantsOnly": False, 3997 "numGenes": 0, 3998 "outputFormats": ["TSV_VARIANT", "VCF"] 3999 } 4000 - "transcript_source" (string): 4001 Transcript source (either "refseq", "ucsc", "ensembl") 4002 Default: "refseq" 4003 - "exomiser_to_info" (boolean): 4004 Add exomiser TSV file columns as INFO fields in VCF. 4005 Default: False 4006 - "release" (string): 4007 Exomise database release. 4008 If not exists, database release will be downloaded (take a while). 4009 Default: None (provided by application.properties configuration file) 4010 - "exomiser_application_properties" (file): 4011 Exomiser configuration file (see Exomiser docs). 4012 Useful to automatically download databases (especially for specific genome databases). 
4013 4014 Notes: 4015 - If no sample in parameters, first sample in VCF will be chosen 4016 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4017 4018 :param threads: The number of threads to use 4019 :return: None. 4020 """ 4021 4022 # DEBUG 4023 log.debug("Start annotation with Exomiser databases") 4024 4025 # Threads 4026 if not threads: 4027 threads = self.get_threads() 4028 log.debug("Threads: " + str(threads)) 4029 4030 # Config 4031 config = self.get_config() 4032 log.debug("Config: " + str(config)) 4033 4034 # Config - Folders - Databases 4035 databases_folders = ( 4036 config.get("folders", {}) 4037 .get("databases", {}) 4038 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4039 ) 4040 databases_folders = full_path(databases_folders) 4041 if not os.path.exists(databases_folders): 4042 log.error(f"Databases annotations: {databases_folders} NOT found") 4043 log.debug("Databases annotations: " + str(databases_folders)) 4044 4045 # Config - Exomiser 4046 exomiser_bin_command = get_bin_command( 4047 bin="exomiser-cli*.jar", 4048 tool="exomiser", 4049 bin_type="jar", 4050 config=config, 4051 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4052 ) 4053 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4054 if not exomiser_bin_command: 4055 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4056 log.error(msg_err) 4057 raise ValueError(msg_err) 4058 4059 # Param 4060 param = self.get_param() 4061 log.debug("Param: " + str(param)) 4062 4063 # Param - Exomiser 4064 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4065 log.debug(f"Param Exomiser: {param_exomiser}") 4066 4067 # Param - Assembly 4068 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4069 log.debug("Assembly: " + str(assembly)) 4070 4071 # Data 4072 table_variants = self.get_table_variants() 4073 4074 # Check if not empty 4075 log.debug("Check if not empty") 4076 sql_query_chromosomes = 
( 4077 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4078 ) 4079 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4080 log.info(f"VCF empty") 4081 return False 4082 4083 # VCF header 4084 vcf_reader = self.get_header() 4085 log.debug("Initial header: " + str(vcf_reader.infos)) 4086 4087 # Samples 4088 samples = self.get_header_sample_list() 4089 if not samples: 4090 log.error("No Samples in VCF") 4091 return False 4092 log.debug(f"Samples: {samples}") 4093 4094 # Memory limit 4095 memory_limit = self.get_memory("8G") 4096 log.debug(f"memory_limit: {memory_limit}") 4097 4098 # Exomiser java options 4099 exomiser_java_options = ( 4100 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4101 ) 4102 log.debug(f"Exomiser java options: {exomiser_java_options}") 4103 4104 # Download Exomiser (if not exists) 4105 exomiser_release = param_exomiser.get("release", None) 4106 exomiser_application_properties = param_exomiser.get( 4107 "exomiser_application_properties", None 4108 ) 4109 databases_download_exomiser( 4110 assemblies=[assembly], 4111 exomiser_folder=databases_folders, 4112 exomiser_release=exomiser_release, 4113 exomiser_phenotype_release=exomiser_release, 4114 exomiser_application_properties=exomiser_application_properties, 4115 ) 4116 4117 # Force annotation 4118 force_update_annotation = True 4119 4120 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4121 log.debug("Start annotation Exomiser") 4122 4123 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4124 4125 # tmp_dir = "/tmp/exomiser" 4126 4127 ### ANALYSIS ### 4128 ################ 4129 4130 # Create analysis.json through analysis dict 4131 # either analysis in param or by default 4132 # depending on preset exome/genome) 4133 4134 # Init analysis dict 4135 param_exomiser_analysis_dict = {} 4136 4137 # analysis from param 4138 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4139 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4140 4141 # If analysis in param -> load anlaysis json 4142 if param_exomiser_analysis: 4143 4144 # If param analysis is a file and exists 4145 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4146 param_exomiser_analysis 4147 ): 4148 # Load analysis file into analysis dict (either yaml or json) 4149 with open(param_exomiser_analysis) as json_file: 4150 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4151 4152 # If param analysis is a dict 4153 elif isinstance(param_exomiser_analysis, dict): 4154 # Load analysis dict into analysis dict (either yaml or json) 4155 param_exomiser_analysis_dict = param_exomiser_analysis 4156 4157 # Error analysis type 4158 else: 4159 log.error(f"Analysis type unknown. Check param file.") 4160 raise ValueError(f"Analysis type unknown. Check param file.") 4161 4162 # Case no input analysis config file/dict 4163 # Use preset (exome/genome) to open default config file 4164 if not param_exomiser_analysis_dict: 4165 4166 # default preset 4167 default_preset = "exome" 4168 4169 # Get param preset or default preset 4170 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4171 4172 # Try to find if preset is a file 4173 if os.path.exists(param_exomiser_preset): 4174 # Preset file is provided in full path 4175 param_exomiser_analysis_default_config_file = ( 4176 param_exomiser_preset 4177 ) 4178 # elif os.path.exists(full_path(param_exomiser_preset)): 4179 # # Preset file is provided in full path 4180 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4181 elif os.path.exists( 4182 os.path.join(folder_config, param_exomiser_preset) 4183 ): 4184 # Preset file is provided a basename in config folder (can be a path with subfolders) 4185 param_exomiser_analysis_default_config_file = os.path.join( 4186 folder_config, param_exomiser_preset 4187 ) 4188 else: 4189 # Construct preset file 4190 
param_exomiser_analysis_default_config_file = os.path.join( 4191 folder_config, 4192 f"preset-{param_exomiser_preset}-analysis.json", 4193 ) 4194 4195 # If preset file exists 4196 param_exomiser_analysis_default_config_file = full_path( 4197 param_exomiser_analysis_default_config_file 4198 ) 4199 if os.path.exists(param_exomiser_analysis_default_config_file): 4200 # Load prest file into analysis dict (either yaml or json) 4201 with open( 4202 param_exomiser_analysis_default_config_file 4203 ) as json_file: 4204 # param_exomiser_analysis_dict[""] = json.load(json_file) 4205 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4206 json_file 4207 ) 4208 4209 # Error preset file 4210 else: 4211 log.error( 4212 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4213 ) 4214 raise ValueError( 4215 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4216 ) 4217 4218 # If no analysis dict created 4219 if not param_exomiser_analysis_dict: 4220 log.error(f"No analysis config") 4221 raise ValueError(f"No analysis config") 4222 4223 # Log 4224 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4225 4226 ### PHENOPACKET ### 4227 ################### 4228 4229 # If no PhenoPacket in analysis dict -> check in param 4230 if "phenopacket" not in param_exomiser_analysis_dict: 4231 4232 # If PhenoPacket in param -> load anlaysis json 4233 if param_exomiser.get("phenopacket", None): 4234 4235 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4236 param_exomiser_phenopacket = full_path( 4237 param_exomiser_phenopacket 4238 ) 4239 4240 # If param phenopacket is a file and exists 4241 if isinstance( 4242 param_exomiser_phenopacket, str 4243 ) and os.path.exists(param_exomiser_phenopacket): 4244 # Load phenopacket file into analysis dict (either yaml or json) 4245 with open(param_exomiser_phenopacket) as json_file: 4246 param_exomiser_analysis_dict["phenopacket"] = ( 4247 yaml.safe_load(json_file) 
4248 ) 4249 4250 # If param phenopacket is a dict 4251 elif isinstance(param_exomiser_phenopacket, dict): 4252 # Load phenopacket dict into analysis dict (either yaml or json) 4253 param_exomiser_analysis_dict["phenopacket"] = ( 4254 param_exomiser_phenopacket 4255 ) 4256 4257 # Error phenopacket type 4258 else: 4259 log.error(f"Phenopacket type unknown. Check param file.") 4260 raise ValueError( 4261 f"Phenopacket type unknown. Check param file." 4262 ) 4263 4264 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4265 if "phenopacket" not in param_exomiser_analysis_dict: 4266 4267 # Init PhenoPacket 4268 param_exomiser_analysis_dict["phenopacket"] = { 4269 "id": "analysis", 4270 "proband": {}, 4271 } 4272 4273 ### Add subject ### 4274 4275 # If subject exists 4276 param_exomiser_subject = param_exomiser.get("subject", {}) 4277 4278 # If subject not exists -> found sample ID 4279 if not param_exomiser_subject: 4280 4281 # Found sample ID in param 4282 sample = param_exomiser.get("sample", None) 4283 4284 # Find sample ID (first sample) 4285 if not sample: 4286 sample_list = self.get_header_sample_list() 4287 if len(sample_list) > 0: 4288 sample = sample_list[0] 4289 else: 4290 log.error(f"No sample found") 4291 raise ValueError(f"No sample found") 4292 4293 # Create subject 4294 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4295 4296 # Add to dict 4297 param_exomiser_analysis_dict["phenopacket"][ 4298 "subject" 4299 ] = param_exomiser_subject 4300 4301 ### Add "phenotypicFeatures" ### 4302 4303 # If phenotypicFeatures exists 4304 param_exomiser_phenotypicfeatures = param_exomiser.get( 4305 "phenotypicFeatures", [] 4306 ) 4307 4308 # If phenotypicFeatures not exists -> Try to infer from hpo list 4309 if not param_exomiser_phenotypicfeatures: 4310 4311 # Found HPO in param 4312 param_exomiser_hpo = param_exomiser.get("hpo", []) 4313 4314 # Split HPO if list in string format separated by comma 4315 if 
isinstance(param_exomiser_hpo, str): 4316 param_exomiser_hpo = param_exomiser_hpo.split(",") 4317 4318 # Create HPO list 4319 for hpo in param_exomiser_hpo: 4320 hpo_clean = re.sub("[^0-9]", "", hpo) 4321 param_exomiser_phenotypicfeatures.append( 4322 { 4323 "type": { 4324 "id": f"HP:{hpo_clean}", 4325 "label": f"HP:{hpo_clean}", 4326 } 4327 } 4328 ) 4329 4330 # Add to dict 4331 param_exomiser_analysis_dict["phenopacket"][ 4332 "phenotypicFeatures" 4333 ] = param_exomiser_phenotypicfeatures 4334 4335 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4336 if not param_exomiser_phenotypicfeatures: 4337 for step in param_exomiser_analysis_dict.get( 4338 "analysis", {} 4339 ).get("steps", []): 4340 if "hiPhivePrioritiser" in step: 4341 param_exomiser_analysis_dict.get("analysis", {}).get( 4342 "steps", [] 4343 ).remove(step) 4344 4345 ### Add Input File ### 4346 4347 # Initial file name and htsFiles 4348 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4349 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4350 { 4351 "uri": tmp_vcf_name, 4352 "htsFormat": "VCF", 4353 "genomeAssembly": assembly, 4354 } 4355 ] 4356 4357 ### Add metaData ### 4358 4359 # If metaData not in analysis dict 4360 if "metaData" not in param_exomiser_analysis_dict: 4361 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4362 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4363 "createdBy": "howard", 4364 "phenopacketSchemaVersion": 1, 4365 } 4366 4367 ### OutputOptions ### 4368 4369 # Init output result folder 4370 output_results = os.path.join(tmp_dir, "results") 4371 4372 # If no outputOptions in analysis dict 4373 if "outputOptions" not in param_exomiser_analysis_dict: 4374 4375 # default output formats 4376 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4377 4378 # Get outputOptions in param 4379 output_options = param_exomiser.get("outputOptions", None) 4380 4381 # If no output_options in param -> check 4382 if not output_options: 
4383 output_options = { 4384 "outputContributingVariantsOnly": False, 4385 "numGenes": 0, 4386 "outputFormats": defaut_output_formats, 4387 } 4388 4389 # Replace outputDirectory in output options 4390 output_options["outputDirectory"] = output_results 4391 output_options["outputFileName"] = "howard" 4392 4393 # Add outputOptions in analysis dict 4394 param_exomiser_analysis_dict["outputOptions"] = output_options 4395 4396 else: 4397 4398 # Replace output_results and output format (if exists in param) 4399 param_exomiser_analysis_dict["outputOptions"][ 4400 "outputDirectory" 4401 ] = output_results 4402 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4403 list( 4404 set( 4405 param_exomiser_analysis_dict.get( 4406 "outputOptions", {} 4407 ).get("outputFormats", []) 4408 + ["TSV_VARIANT", "VCF"] 4409 ) 4410 ) 4411 ) 4412 4413 # log 4414 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4415 4416 ### ANALYSIS FILE ### 4417 ##################### 4418 4419 ### Full JSON analysis config file ### 4420 4421 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4422 with open(exomiser_analysis, "w") as fp: 4423 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4424 4425 ### SPLIT analysis and sample config files 4426 4427 # Splitted analysis dict 4428 param_exomiser_analysis_dict_for_split = ( 4429 param_exomiser_analysis_dict.copy() 4430 ) 4431 4432 # Phenopacket JSON file 4433 exomiser_analysis_phenopacket = os.path.join( 4434 tmp_dir, "analysis_phenopacket.json" 4435 ) 4436 with open(exomiser_analysis_phenopacket, "w") as fp: 4437 json.dump( 4438 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4439 fp, 4440 indent=4, 4441 ) 4442 4443 # Analysis JSON file without Phenopacket parameters 4444 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4445 exomiser_analysis_analysis = os.path.join( 4446 tmp_dir, "analysis_analysis.json" 4447 ) 4448 with open(exomiser_analysis_analysis, "w") as fp: 4449 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4450 4451 ### INITAL VCF file ### 4452 ####################### 4453 4454 ### Create list of samples to use and include inti initial VCF file #### 4455 4456 # Subject (main sample) 4457 # Get sample ID in analysis dict 4458 sample_subject = ( 4459 param_exomiser_analysis_dict.get("phenopacket", {}) 4460 .get("subject", {}) 4461 .get("id", None) 4462 ) 4463 sample_proband = ( 4464 param_exomiser_analysis_dict.get("phenopacket", {}) 4465 .get("proband", {}) 4466 .get("subject", {}) 4467 .get("id", None) 4468 ) 4469 sample = [] 4470 if sample_subject: 4471 sample.append(sample_subject) 4472 if sample_proband: 4473 sample.append(sample_proband) 4474 4475 # Get sample ID within Pedigree 4476 pedigree_persons_list = ( 4477 param_exomiser_analysis_dict.get("phenopacket", {}) 4478 .get("pedigree", {}) 4479 .get("persons", {}) 4480 ) 4481 4482 # Create list with all sample ID in pedigree (if exists) 4483 pedigree_persons = [] 4484 for person in pedigree_persons_list: 4485 pedigree_persons.append(person.get("individualId")) 4486 4487 # Concat subject sample ID and samples ID in pedigreesamples 4488 samples = list(set(sample + pedigree_persons)) 4489 4490 # Check if sample list is not empty 4491 if not samples: 4492 log.error(f"No samples found") 4493 raise ValueError(f"No samples found") 4494 4495 # Create VCF with sample (either sample in param or first one by default) 4496 # Export VCF file 4497 self.export_variant_vcf( 4498 vcf_file=tmp_vcf_name, 4499 remove_info=True, 4500 add_samples=True, 4501 list_samples=samples, 4502 index=False, 4503 ) 4504 4505 ### Execute Exomiser ### 4506 ######################## 4507 4508 # Init command 4509 exomiser_command = "" 4510 4511 # Command exomiser options 4512 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4513 4514 # Release 4515 exomiser_release = 
param_exomiser.get("release", None) 4516 if exomiser_release: 4517 # phenotype data version 4518 exomiser_options += ( 4519 f" --exomiser.phenotype.data-version={exomiser_release} " 4520 ) 4521 # data version 4522 exomiser_options += ( 4523 f" --exomiser.{assembly}.data-version={exomiser_release} " 4524 ) 4525 # variant white list 4526 variant_white_list_file = ( 4527 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4528 ) 4529 if os.path.exists( 4530 os.path.join( 4531 databases_folders, assembly, variant_white_list_file 4532 ) 4533 ): 4534 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4535 4536 # transcript_source 4537 transcript_source = param_exomiser.get( 4538 "transcript_source", None 4539 ) # ucsc, refseq, ensembl 4540 if transcript_source: 4541 exomiser_options += ( 4542 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4543 ) 4544 4545 # If analysis contain proband param 4546 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4547 "proband", {} 4548 ): 4549 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4550 4551 # If no proband (usually uniq sample) 4552 else: 4553 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4554 4555 # Log 4556 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4557 4558 # Run command 4559 result = subprocess.call( 4560 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4561 ) 4562 if result: 4563 log.error("Exomiser command failed") 4564 raise ValueError("Exomiser command failed") 4565 4566 ### RESULTS ### 4567 ############### 4568 4569 ### Annotate with TSV fields ### 4570 4571 # Init result tsv file 4572 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4573 4574 # Init result tsv file 4575 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4576 4577 # Parse TSV file and explode columns in INFO field 4578 if exomiser_to_info and os.path.exists(output_results_tsv): 4579 4580 # Log 4581 log.debug("Exomiser columns to VCF INFO field") 4582 4583 # Retrieve columns and types 4584 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4585 output_results_tsv_df = self.get_query_to_df(query) 4586 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4587 4588 # Init concat fields for update 4589 sql_query_update_concat_fields = [] 4590 4591 # Fields to avoid 4592 fields_to_avoid = [ 4593 "CONTIG", 4594 "START", 4595 "END", 4596 "REF", 4597 "ALT", 4598 "QUAL", 4599 "FILTER", 4600 "GENOTYPE", 4601 ] 4602 4603 # List all columns to add into header 4604 for header_column in output_results_tsv_columns: 4605 4606 # If header column is enable 4607 if header_column not in fields_to_avoid: 4608 4609 # Header info type 4610 header_info_type = "String" 4611 header_column_df = output_results_tsv_df[header_column] 4612 header_column_df_dtype = header_column_df.dtype 4613 if header_column_df_dtype == object: 4614 if ( 4615 pd.to_numeric(header_column_df, errors="coerce") 4616 .notnull() 4617 .all() 4618 ): 4619 header_info_type = "Float" 4620 else: 4621 header_info_type = "Integer" 4622 4623 # Header info 4624 characters_to_validate = ["-"] 4625 pattern = "[" + "".join(characters_to_validate) + "]" 4626 header_info_name = re.sub( 4627 pattern, 4628 "_", 4629 f"Exomiser_{header_column}".replace("#", ""), 4630 ) 4631 header_info_number = "." 
4632 header_info_description = ( 4633 f"Exomiser {header_column} annotation" 4634 ) 4635 header_info_source = "Exomiser" 4636 header_info_version = "unknown" 4637 header_info_code = CODE_TYPE_MAP[header_info_type] 4638 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4639 header_info_name, 4640 header_info_number, 4641 header_info_type, 4642 header_info_description, 4643 header_info_source, 4644 header_info_version, 4645 header_info_code, 4646 ) 4647 4648 # Add field to add for update to concat fields 4649 sql_query_update_concat_fields.append( 4650 f""" 4651 CASE 4652 WHEN table_parquet."{header_column}" NOT IN ('','.') 4653 THEN concat( 4654 '{header_info_name}=', 4655 table_parquet."{header_column}", 4656 ';' 4657 ) 4658 4659 ELSE '' 4660 END 4661 """ 4662 ) 4663 4664 # Update query 4665 sql_query_update = f""" 4666 UPDATE {table_variants} as table_variants 4667 SET INFO = concat( 4668 CASE 4669 WHEN INFO NOT IN ('', '.') 4670 THEN INFO 4671 ELSE '' 4672 END, 4673 CASE 4674 WHEN table_variants.INFO NOT IN ('','.') 4675 THEN ';' 4676 ELSE '' 4677 END, 4678 ( 4679 SELECT 4680 concat( 4681 {",".join(sql_query_update_concat_fields)} 4682 ) 4683 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4684 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4685 AND table_parquet.\"START\" = table_variants.\"POS\" 4686 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4687 AND table_parquet.\"REF\" = table_variants.\"REF\" 4688 ) 4689 ) 4690 ; 4691 """ 4692 4693 # Update 4694 self.conn.execute(sql_query_update) 4695 4696 ### Annotate with VCF INFO field ### 4697 4698 # Init result VCF file 4699 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4700 4701 # If VCF exists 4702 if os.path.exists(output_results_vcf): 4703 4704 # Log 4705 log.debug("Exomiser result VCF update variants") 4706 4707 # Find Exomiser INFO field annotation in header 4708 with 
gzip.open(output_results_vcf, "rt") as f: 4709 header_list = self.read_vcf_header(f) 4710 exomiser_vcf_header = vcf.Reader( 4711 io.StringIO("\n".join(header_list)) 4712 ) 4713 4714 # Add annotation INFO field to header 4715 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4716 4717 # Update variants with VCF 4718 self.update_from_vcf(output_results_vcf) 4719 4720 return True 4721 4722 def annotation_snpeff(self, threads: int = None) -> None: 4723 """ 4724 This function annotate with snpEff 4725 4726 :param threads: The number of threads to use 4727 :return: the value of the variable "return_value". 4728 """ 4729 4730 # DEBUG 4731 log.debug("Start annotation with snpeff databases") 4732 4733 # Threads 4734 if not threads: 4735 threads = self.get_threads() 4736 log.debug("Threads: " + str(threads)) 4737 4738 # DEBUG 4739 delete_tmp = True 4740 if self.get_config().get("verbosity", "warning") in ["debug"]: 4741 delete_tmp = False 4742 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4743 4744 # Config 4745 config = self.get_config() 4746 log.debug("Config: " + str(config)) 4747 4748 # Config - Folders - Databases 4749 databases_folders = ( 4750 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4751 ) 4752 log.debug("Databases annotations: " + str(databases_folders)) 4753 4754 # # Config - Java 4755 # java_bin = get_bin( 4756 # tool="java", 4757 # bin="java", 4758 # bin_type="bin", 4759 # config=config, 4760 # default_folder="/usr/bin", 4761 # ) 4762 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4763 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4764 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4765 4766 # # Config - snpEff bin 4767 # snpeff_jar = get_bin( 4768 # tool="snpeff", 4769 # bin="snpEff.jar", 4770 # bin_type="jar", 4771 # config=config, 4772 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4773 # ) 4774 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4775 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4776 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4777 4778 # Config - snpEff bin command 4779 snpeff_bin_command = get_bin_command( 4780 bin="snpEff.jar", 4781 tool="snpeff", 4782 bin_type="jar", 4783 config=config, 4784 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4785 ) 4786 if not snpeff_bin_command: 4787 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4788 log.error(msg_err) 4789 raise ValueError(msg_err) 4790 4791 # Config - snpEff databases 4792 snpeff_databases = ( 4793 config.get("folders", {}) 4794 .get("databases", {}) 4795 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4796 ) 4797 snpeff_databases = full_path(snpeff_databases) 4798 if snpeff_databases is not None and snpeff_databases != "": 4799 log.debug(f"Create snpEff databases folder") 4800 if not os.path.exists(snpeff_databases): 4801 os.makedirs(snpeff_databases) 4802 4803 # Param 4804 param = self.get_param() 4805 log.debug("Param: " + str(param)) 4806 4807 # Param 4808 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4809 log.debug("Options: " + str(options)) 4810 4811 # Param - Assembly 4812 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4813 4814 # Param - Options 4815 snpeff_options = ( 4816 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4817 ) 4818 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4819 snpeff_csvstats = ( 4820 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4821 ) 4822 if snpeff_stats: 4823 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4824 snpeff_stats = full_path(snpeff_stats) 4825 snpeff_options += f" -stats {snpeff_stats}" 4826 if snpeff_csvstats: 4827 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4828 snpeff_csvstats = full_path(snpeff_csvstats) 4829 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4830 4831 # Data 4832 table_variants = self.get_table_variants() 4833 4834 # Check if not empty 4835 log.debug("Check if not empty") 4836 sql_query_chromosomes = ( 4837 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4838 ) 4839 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4840 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4841 log.info(f"VCF empty") 4842 return 4843 4844 # Export in VCF 4845 log.debug("Create initial file to annotate") 4846 tmp_vcf = NamedTemporaryFile( 4847 prefix=self.get_prefix(), 4848 dir=self.get_tmp_dir(), 4849 suffix=".vcf.gz", 4850 delete=True, 4851 ) 4852 tmp_vcf_name = tmp_vcf.name 4853 4854 # VCF header 4855 vcf_reader = self.get_header() 4856 log.debug("Initial header: " + str(vcf_reader.infos)) 4857 4858 # Existing annotations 4859 for vcf_annotation in self.get_header().infos: 4860 4861 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4862 log.debug( 4863 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4864 ) 4865 4866 # Memory limit 4867 # if config.get("memory", None): 4868 # memory_limit = config.get("memory", "8G") 4869 # else: 4870 # memory_limit = "8G" 4871 memory_limit = self.get_memory("8G") 4872 log.debug(f"memory_limit: {memory_limit}") 4873 4874 # snpEff java options 4875 snpeff_java_options = ( 4876 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4877 ) 4878 log.debug(f"Exomiser java options: {snpeff_java_options}") 4879 4880 force_update_annotation = True 4881 4882 if "ANN" not in self.get_header().infos or force_update_annotation: 4883 4884 # Check snpEff database 4885 log.debug(f"Check snpEff databases {[assembly]}") 4886 databases_download_snpeff( 4887 folder=snpeff_databases, assemblies=[assembly], config=config 4888 ) 4889 4890 # Export VCF file 4891 self.export_variant_vcf( 4892 vcf_file=tmp_vcf_name, 4893 remove_info=True, 
4894 add_samples=False, 4895 index=True, 4896 ) 4897 4898 # Tmp file 4899 err_files = [] 4900 tmp_annotate_vcf = NamedTemporaryFile( 4901 prefix=self.get_prefix(), 4902 dir=self.get_tmp_dir(), 4903 suffix=".vcf", 4904 delete=False, 4905 ) 4906 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4907 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4908 err_files.append(tmp_annotate_vcf_name_err) 4909 4910 # Command 4911 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4912 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4913 run_parallel_commands([snpeff_command], 1) 4914 4915 # Error messages 4916 log.info(f"Error/Warning messages:") 4917 error_message_command_all = [] 4918 error_message_command_warning = [] 4919 error_message_command_err = [] 4920 for err_file in err_files: 4921 with open(err_file, "r") as f: 4922 for line in f: 4923 message = line.strip() 4924 error_message_command_all.append(message) 4925 if line.startswith("[W::"): 4926 error_message_command_warning.append(message) 4927 if line.startswith("[E::"): 4928 error_message_command_err.append(f"{err_file}: " + message) 4929 # log info 4930 for message in list( 4931 set(error_message_command_err + error_message_command_warning) 4932 ): 4933 log.info(f" {message}") 4934 # debug info 4935 for message in list(set(error_message_command_all)): 4936 log.debug(f" {message}") 4937 # failed 4938 if len(error_message_command_err): 4939 log.error("Annotation failed: Error in commands") 4940 raise ValueError("Annotation failed: Error in commands") 4941 4942 # Find annotation in header 4943 with open(tmp_annotate_vcf_name, "rt") as f: 4944 header_list = self.read_vcf_header(f) 4945 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4946 4947 for ann in annovar_vcf_header.infos: 4948 if ann not in self.get_header().infos: 4949 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann) 4950 4951 # Update variants 4952 log.info(f"Annotation - Updating...") 4953 self.update_from_vcf(tmp_annotate_vcf_name) 4954 4955 else: 4956 if "ANN" in self.get_header().infos: 4957 log.debug(f"Existing snpEff annotations in VCF") 4958 if force_update_annotation: 4959 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 4960 4961 def annotation_annovar(self, threads: int = None) -> None: 4962 """ 4963 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4964 annotations 4965 4966 :param threads: number of threads to use 4967 :return: the value of the variable "return_value". 4968 """ 4969 4970 # DEBUG 4971 log.debug("Start annotation with Annovar databases") 4972 4973 # Threads 4974 if not threads: 4975 threads = self.get_threads() 4976 log.debug("Threads: " + str(threads)) 4977 4978 # Tmp en Err files 4979 tmp_files = [] 4980 err_files = [] 4981 4982 # DEBUG 4983 delete_tmp = True 4984 if self.get_config().get("verbosity", "warning") in ["debug"]: 4985 delete_tmp = False 4986 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4987 4988 # Config 4989 config = self.get_config() 4990 log.debug("Config: " + str(config)) 4991 4992 # Config - Folders - Databases 4993 databases_folders = ( 4994 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4995 ) 4996 log.debug("Databases annotations: " + str(databases_folders)) 4997 4998 # Config - annovar bin command 4999 annovar_bin_command = get_bin_command( 5000 bin="table_annovar.pl", 5001 tool="annovar", 5002 bin_type="perl", 5003 config=config, 5004 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5005 ) 5006 if not annovar_bin_command: 5007 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5008 log.error(msg_err) 5009 raise ValueError(msg_err) 5010 5011 # Config - BCFTools bin command 5012 bcftools_bin_command = get_bin_command( 5013 bin="bcftools", 5014 tool="bcftools", 5015 
bin_type="bin", 5016 config=config, 5017 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5018 ) 5019 if not bcftools_bin_command: 5020 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5021 log.error(msg_err) 5022 raise ValueError(msg_err) 5023 5024 # Config - annovar databases 5025 annovar_databases = ( 5026 config.get("folders", {}) 5027 .get("databases", {}) 5028 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5029 ) 5030 annovar_databases = full_path(annovar_databases) 5031 if annovar_databases != "" and not os.path.exists(annovar_databases): 5032 os.makedirs(annovar_databases) 5033 5034 # Param 5035 param = self.get_param() 5036 log.debug("Param: " + str(param)) 5037 5038 # Param - options 5039 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5040 log.debug("Options: " + str(options)) 5041 5042 # Param - annotations 5043 annotations = ( 5044 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5045 ) 5046 log.debug("Annotations: " + str(annotations)) 5047 5048 # Param - Assembly 5049 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5050 5051 # Annovar database assembly 5052 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5053 if annovar_databases_assembly != "" and not os.path.exists( 5054 annovar_databases_assembly 5055 ): 5056 os.makedirs(annovar_databases_assembly) 5057 5058 # Data 5059 table_variants = self.get_table_variants() 5060 5061 # Check if not empty 5062 log.debug("Check if not empty") 5063 sql_query_chromosomes = ( 5064 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5065 ) 5066 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5067 if not sql_query_chromosomes_df["count"][0]: 5068 log.info(f"VCF empty") 5069 return 5070 5071 # VCF header 5072 vcf_reader = self.get_header() 5073 log.debug("Initial header: " + str(vcf_reader.infos)) 5074 5075 # Existing annotations 5076 for vcf_annotation in 
self.get_header().infos: 5077 5078 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5079 log.debug( 5080 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5081 ) 5082 5083 force_update_annotation = True 5084 5085 if annotations: 5086 5087 commands = [] 5088 tmp_annotates_vcf_name_list = [] 5089 5090 # Export in VCF 5091 log.debug("Create initial file to annotate") 5092 tmp_vcf = NamedTemporaryFile( 5093 prefix=self.get_prefix(), 5094 dir=self.get_tmp_dir(), 5095 suffix=".vcf.gz", 5096 delete=False, 5097 ) 5098 tmp_vcf_name = tmp_vcf.name 5099 tmp_files.append(tmp_vcf_name) 5100 tmp_files.append(tmp_vcf_name + ".tbi") 5101 5102 # Export VCF file 5103 self.export_variant_vcf( 5104 vcf_file=tmp_vcf_name, 5105 remove_info=".", 5106 add_samples=False, 5107 index=True, 5108 ) 5109 5110 # Create file for field rename 5111 log.debug("Create file for field rename") 5112 tmp_rename = NamedTemporaryFile( 5113 prefix=self.get_prefix(), 5114 dir=self.get_tmp_dir(), 5115 suffix=".rename", 5116 delete=False, 5117 ) 5118 tmp_rename_name = tmp_rename.name 5119 tmp_files.append(tmp_rename_name) 5120 5121 # Check Annovar database 5122 log.debug( 5123 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5124 ) 5125 databases_download_annovar( 5126 folder=annovar_databases, 5127 files=list(annotations.keys()), 5128 assemblies=[assembly], 5129 ) 5130 5131 for annotation in annotations: 5132 annotation_fields = annotations[annotation] 5133 5134 if not annotation_fields: 5135 annotation_fields = {"INFO": None} 5136 5137 log.info(f"Annotations Annovar - database '{annotation}'") 5138 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5139 5140 # Tmp file for annovar 5141 err_files = [] 5142 tmp_annotate_vcf_directory = TemporaryDirectory( 5143 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5144 ) 5145 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5146 
tmp_annotate_vcf_name_annovar = ( 5147 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5148 ) 5149 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5150 err_files.append(tmp_annotate_vcf_name_err) 5151 tmp_files.append(tmp_annotate_vcf_name_err) 5152 5153 # Tmp file final vcf annotated by annovar 5154 tmp_annotate_vcf = NamedTemporaryFile( 5155 prefix=self.get_prefix(), 5156 dir=self.get_tmp_dir(), 5157 suffix=".vcf.gz", 5158 delete=False, 5159 ) 5160 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5161 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5162 tmp_files.append(tmp_annotate_vcf_name) 5163 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5164 5165 # Number of fields 5166 annotation_list = [] 5167 annotation_renamed_list = [] 5168 5169 for annotation_field in annotation_fields: 5170 5171 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5172 annotation_fields_new_name = annotation_fields.get( 5173 annotation_field, annotation_field 5174 ) 5175 if not annotation_fields_new_name: 5176 annotation_fields_new_name = annotation_field 5177 5178 if ( 5179 force_update_annotation 5180 or annotation_fields_new_name not in self.get_header().infos 5181 ): 5182 annotation_list.append(annotation_field) 5183 annotation_renamed_list.append(annotation_fields_new_name) 5184 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5185 log.warning( 5186 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5187 ) 5188 5189 # Add rename info 5190 run_parallel_commands( 5191 [ 5192 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5193 ], 5194 1, 5195 ) 5196 5197 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5198 log.debug("annotation_list: " + str(annotation_list)) 5199 5200 # protocol 5201 protocol = annotation 5202 5203 # argument 5204 argument = "" 5205 5206 # operation 5207 operation = "f" 
5208 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5209 "ensGene" 5210 ): 5211 operation = "g" 5212 if options.get("genebase", None): 5213 argument = f"""'{options.get("genebase","")}'""" 5214 elif annotation in ["cytoBand"]: 5215 operation = "r" 5216 5217 # argument option 5218 argument_option = "" 5219 if argument != "": 5220 argument_option = " --argument " + argument 5221 5222 # command options 5223 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5224 for option in options: 5225 if option not in ["genebase"]: 5226 command_options += f""" --{option}={options[option]}""" 5227 5228 # Command 5229 5230 # Command - Annovar 5231 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5232 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5233 5234 # Command - start pipe 5235 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5236 5237 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5238 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5239 5240 # Command - Special characters (refGene annotation) 5241 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5242 5243 # Command - Clean empty fields (with value ".") 5244 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5245 5246 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5247 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5248 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5249 # for ann in annotation_renamed_list: 5250 for ann in annotation_list: 5251 annovar_fields_to_keep.append(f"^INFO/{ann}") 5252 5253 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5254 5255 # Command - indexing 5256 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5257 5258 log.debug(f"Annotation - Annovar command: {command_annovar}") 5259 run_parallel_commands([command_annovar], 1) 5260 5261 # Error messages 5262 log.info(f"Error/Warning messages:") 5263 error_message_command_all = [] 5264 error_message_command_warning = [] 5265 error_message_command_err = [] 5266 for err_file in err_files: 5267 with open(err_file, "r") as f: 5268 for line in f: 5269 message = line.strip() 5270 error_message_command_all.append(message) 5271 if line.startswith("[W::") or line.startswith("WARNING"): 5272 error_message_command_warning.append(message) 5273 if line.startswith("[E::") or line.startswith("ERROR"): 5274 
error_message_command_err.append( 5275 f"{err_file}: " + message 5276 ) 5277 # log info 5278 for message in list( 5279 set(error_message_command_err + error_message_command_warning) 5280 ): 5281 log.info(f" {message}") 5282 # debug info 5283 for message in list(set(error_message_command_all)): 5284 log.debug(f" {message}") 5285 # failed 5286 if len(error_message_command_err): 5287 log.error("Annotation failed: Error in commands") 5288 raise ValueError("Annotation failed: Error in commands") 5289 5290 if tmp_annotates_vcf_name_list: 5291 5292 # List of annotated files 5293 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5294 5295 # Tmp file 5296 tmp_annotate_vcf = NamedTemporaryFile( 5297 prefix=self.get_prefix(), 5298 dir=self.get_tmp_dir(), 5299 suffix=".vcf.gz", 5300 delete=False, 5301 ) 5302 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5303 tmp_files.append(tmp_annotate_vcf_name) 5304 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5305 err_files.append(tmp_annotate_vcf_name_err) 5306 tmp_files.append(tmp_annotate_vcf_name_err) 5307 5308 # Command merge 5309 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5310 log.info( 5311 f"Annotation Annovar - Annotation merging " 5312 + str(len(tmp_annotates_vcf_name_list)) 5313 + " annotated files" 5314 ) 5315 log.debug(f"Annotation - merge command: {merge_command}") 5316 run_parallel_commands([merge_command], 1) 5317 5318 # Find annotation in header 5319 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5320 header_list = self.read_vcf_header(f) 5321 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5322 5323 for ann in annovar_vcf_header.infos: 5324 if ann not in self.get_header().infos: 5325 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5326 5327 # Update variants 5328 log.info(f"Annotation Annovar - 
Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): 'if True' looks like a leftover debug toggle (cf. the
        # unused delete_tmp flag computed earlier) -- confirm intent.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with Parquet/DuckDB annotation databases.

        For every database listed in param ``annotation.parquet.annotations``
        (the special key ``"ALL"`` expands, via ``self.scan_databases``, to every
        database found for the configured formats/releases), this method:

        - resolves the database file, its header file and its type through a
          ``Database`` object, attaching it to the DuckDB connexion if needed;
        - registers the selected database INFO fields into the in-memory VCF
          header (``self.get_header().infos``);
        - builds one SQL ``UPDATE`` per chromosome that appends
          ``field=value`` pairs to the ``INFO`` column of the variants table,
          joining on CHROM/POS(/REF/ALT) for "variants" databases or on
          position-in-region for "regions" databases;
        - executes the queries and logs the number of annotated variants.

        Param option ``annotation.options.annotations_update`` first removes
        pre-existing occurrences of the target fields from INFO; option
        ``annotation.options.annotations_append`` only fills fields whose
        current value is empty ('' or '.').

        :param threads: number of threads to use for the annotation
            (defaults to ``self.get_threads()``)
        :return: None; returns early if the variants table is empty
        :raises ValueError: if a database file or its header file is not found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but never used in this method.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - folders to search for annotation databases
        # (both "annotations" and "parquet" folders, de-duplicated)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, as {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation - re-annotate fields already present in INFO
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Append mode - only fill fields whose current INFO value is empty
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data - variants table name
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated in place as new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS - total variant count, for the final summary log
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped again at the end of the method)
        added_columns = []

        # drop indexes (before the mass UPDATEs)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: expand to every scanned database,
            # optionally filtered by formats/releases
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is only an expansion marker, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields ({field: new_name}; default: whole INFO)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object (resolves file within folders/assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH for duckdb databases)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    # SQL expression used as FROM-clause source for this database
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object (INFO field definitions)
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF ones)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields:
                    # register each extra column as a generic String INFO field
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database ("ALL"/"INFO" -> every header field)
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases only)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (field name -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column (fallback: packed INFO column)
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: field exists in the database header, and
                        # either it is new to the VCF header or update/append is forced
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field: in update mode, strip any
                            # existing 'field=...' entry from INFO before annotating
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (';' from the 2nd field on)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (defaults for missing metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append mode: only annotate when the variant's current
                            # value for this field is empty ('' or '.')
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column: extract 'field=value' from the
                            # database's packed INFO string
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column: read it directly
                            # (';' in values replaced by ',' to keep INFO parseable)
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # (shortcut: copy the whole database INFO column in one go)
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE: aliases query_dict_remove (not a copy), so the
                        # 'remove field' queries run before the per-chromosome updates
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join variants whose POS
                            # falls within [START+1, END], aggregating overlapping
                            # region values per POS
                            # (presumably START is 0-based, BED-style -- TODO confirm)
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                    """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    """

                            # Annotation with variants database: exact match on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                    """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    """

                            # Create update query: append new 'field=value' pairs to
                            # INFO, inserting a ';' only when INFO is non-empty and
                            # something is actually being added
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the concatenated CASE expressions can exceed DuckDB's
                        # default expression-depth limit)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # "Count" = number of rows affected by the UPDATE
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        Annotate the variants with splice tools (SpliceAI / SPiP) run through a
        docker-based Nextflow pipeline.

        :param threads: The number of threads to use
        :return: None; returns early when there is nothing to annotate
        
5925 """ 5926 5927 # DEBUG 5928 log.debug("Start annotation with splice tools") 5929 5930 # Threads 5931 if not threads: 5932 threads = self.get_threads() 5933 log.debug("Threads: " + str(threads)) 5934 5935 # DEBUG 5936 delete_tmp = True 5937 if self.get_config().get("verbosity", "warning") in ["debug"]: 5938 delete_tmp = False 5939 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5940 5941 # Config 5942 config = self.get_config() 5943 log.debug("Config: " + str(config)) 5944 splice_config = config.get("tools", {}).get("splice", {}) 5945 if not splice_config: 5946 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5947 if not splice_config: 5948 msg_err = "No Splice tool config" 5949 log.error(msg_err) 5950 raise ValueError(msg_err) 5951 log.debug(f"splice_config={splice_config}") 5952 5953 # Config - Folders - Databases 5954 databases_folders = ( 5955 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5956 ) 5957 log.debug("Databases annotations: " + str(databases_folders)) 5958 5959 # Splice docker image 5960 splice_docker_image = splice_config.get("docker").get("image") 5961 5962 # Pull splice image if it's not already there 5963 if not check_docker_image_exists(splice_docker_image): 5964 log.warning( 5965 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5966 ) 5967 try: 5968 command(f"docker pull {splice_config.get('docker').get('image')}") 5969 except subprocess.CalledProcessError: 5970 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5971 log.error(msg_err) 5972 raise ValueError(msg_err) 5973 return None 5974 5975 # Config - splice databases 5976 splice_databases = ( 5977 config.get("folders", {}) 5978 .get("databases", {}) 5979 .get("splice", DEFAULT_SPLICE_FOLDER) 5980 ) 5981 splice_databases = full_path(splice_databases) 5982 5983 # Param 5984 param = self.get_param() 5985 log.debug("Param: " + str(param)) 5986 5987 # Param 5988 options = 
param.get("annotation", {}).get("splice", {}) 5989 log.debug("Options: " + str(options)) 5990 5991 # Data 5992 table_variants = self.get_table_variants() 5993 5994 # Check if not empty 5995 log.debug("Check if not empty") 5996 sql_query_chromosomes = ( 5997 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5998 ) 5999 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6000 log.info("VCF empty") 6001 return None 6002 6003 # Export in VCF 6004 log.debug("Create initial file to annotate") 6005 6006 # Create output folder 6007 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6008 if not os.path.exists(output_folder): 6009 Path(output_folder).mkdir(parents=True, exist_ok=True) 6010 6011 # Create tmp VCF file 6012 tmp_vcf = NamedTemporaryFile( 6013 prefix=self.get_prefix(), 6014 dir=output_folder, 6015 suffix=".vcf", 6016 delete=False, 6017 ) 6018 tmp_vcf_name = tmp_vcf.name 6019 6020 # VCF header 6021 header = self.get_header() 6022 6023 # Existing annotations 6024 for vcf_annotation in self.get_header().infos: 6025 6026 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6027 log.debug( 6028 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6029 ) 6030 6031 # Memory limit 6032 if config.get("memory", None): 6033 memory_limit = config.get("memory", "8G").upper() 6034 # upper() 6035 else: 6036 memory_limit = "8G" 6037 log.debug(f"memory_limit: {memory_limit}") 6038 6039 # Check number of variants to annotate 6040 where_clause_regex_spliceai = r"SpliceAI_\w+" 6041 where_clause_regex_spip = r"SPiP_\w+" 6042 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6043 df_list_of_variants_to_annotate = self.get_query_to_df( 6044 query=f""" SELECT * FROM variants {where_clause} """ 6045 ) 6046 if len(df_list_of_variants_to_annotate) == 0: 6047 log.warning( 6048 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6049 ) 6050 return None 6051 else: 6052 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6053 6054 # Export VCF file 6055 self.export_variant_vcf( 6056 vcf_file=tmp_vcf_name, 6057 remove_info=True, 6058 add_samples=True, 6059 index=False, 6060 where_clause=where_clause, 6061 ) 6062 6063 # Create docker container and launch splice analysis 6064 if splice_config: 6065 6066 # Splice mount folders 6067 mount_folders = splice_config.get("mount", {}) 6068 6069 # Genome mount 6070 mount_folders[ 6071 config.get("folders", {}) 6072 .get("databases", {}) 6073 .get("genomes", DEFAULT_GENOME_FOLDER) 6074 ] = "ro" 6075 6076 # SpliceAI mount 6077 mount_folders[ 6078 config.get("folders", {}) 6079 .get("databases", {}) 6080 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6081 ] = "ro" 6082 6083 # Genome mount 6084 mount_folders[ 6085 config.get("folders", {}) 6086 .get("databases", {}) 6087 .get("spip", DEFAULT_SPIP_FOLDER) 6088 ] = "ro" 6089 6090 # Mount folders 6091 mount = [] 6092 6093 # Config mount 6094 mount = [ 6095 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6096 for path, mode in mount_folders.items() 6097 ] 6098 6099 if any(value for value in splice_config.values() if value is None): 6100 log.warning("At least one splice config parameter is empty") 6101 return None 6102 6103 # Params in splice nf 6104 def check_values(dico: dict): 6105 """ 6106 Ensure parameters for NF splice pipeline 6107 """ 6108 for key, val in dico.items(): 6109 if key == "genome": 6110 if any( 6111 assemb in options.get("genome", {}) 6112 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6113 ): 6114 yield f"--{key} hg19" 6115 elif any( 6116 assemb in options.get("genome", {}) 6117 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6118 ): 6119 yield f"--{key} hg38" 6120 elif ( 6121 (isinstance(val, str) and val) 6122 or isinstance(val, int) 6123 or isinstance(val, bool) 6124 ): 6125 yield f"--{key} 
{val}" 6126 6127 # Genome 6128 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6129 options["genome"] = genome 6130 6131 # NF params 6132 nf_params = [] 6133 6134 # Add options 6135 if options: 6136 nf_params = list(check_values(options)) 6137 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6138 else: 6139 log.debug("No NF params provided") 6140 6141 # Add threads 6142 if "threads" not in options.keys(): 6143 nf_params.append(f"--threads {threads}") 6144 6145 # Genome path 6146 genome_path = find_genome( 6147 config.get("folders", {}) 6148 .get("databases", {}) 6149 .get("genomes", DEFAULT_GENOME_FOLDER), 6150 file=f"{genome}.fa", 6151 ) 6152 # Add genome path 6153 if not genome_path: 6154 raise ValueError( 6155 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6156 ) 6157 else: 6158 log.debug(f"Genome: {genome_path}") 6159 nf_params.append(f"--genome_path {genome_path}") 6160 6161 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6162 """ 6163 Setting up updated databases for SPiP and SpliceAI 6164 """ 6165 6166 try: 6167 6168 # SpliceAI assembly transcriptome 6169 spliceai_assembly = os.path.join( 6170 config.get("folders", {}) 6171 .get("databases", {}) 6172 .get("spliceai", {}), 6173 options.get("genome"), 6174 "transcriptome", 6175 ) 6176 spip_assembly = options.get("genome") 6177 6178 spip = find( 6179 f"transcriptome_{spip_assembly}.RData", 6180 config.get("folders", {}).get("databases", {}).get("spip", {}), 6181 ) 6182 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6183 log.debug(f"SPiP annotations: {spip}") 6184 log.debug(f"SpliceAI annotations: {spliceai}") 6185 if spip and spliceai: 6186 return [ 6187 f"--spip_transcriptome {spip}", 6188 f"--spliceai_annotations {spliceai}", 6189 ] 6190 else: 6191 # TODO crash and go on with basic annotations ? 
6192 # raise ValueError( 6193 # "Can't find splice databases in configuration EXIT" 6194 # ) 6195 log.warning( 6196 "Can't find splice databases in configuration, use annotations file from image" 6197 ) 6198 except TypeError: 6199 log.warning( 6200 "Can't find splice databases in configuration, use annotations file from image" 6201 ) 6202 return [] 6203 6204 # Add options, check if transcriptome option have already beend provided 6205 if ( 6206 "spip_transcriptome" not in nf_params 6207 and "spliceai_transcriptome" not in nf_params 6208 ): 6209 splice_reference = splice_annotations(options, config) 6210 if splice_reference: 6211 nf_params.extend(splice_reference) 6212 6213 nf_params.append(f"--output_folder {output_folder}") 6214 6215 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6216 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6217 log.debug(cmd) 6218 6219 splice_config["docker"]["command"] = cmd 6220 6221 docker_cmd = get_bin_command( 6222 tool="splice", 6223 bin_type="docker", 6224 config=config, 6225 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6226 add_options=f"--name {random_uuid} {' '.join(mount)}", 6227 ) 6228 6229 # Docker debug 6230 # if splice_config.get("rm_container"): 6231 # rm_container = "--rm" 6232 # else: 6233 # rm_container = "" 6234 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6235 6236 log.debug(docker_cmd) 6237 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6238 log.debug(res.stdout) 6239 if res.stderr: 6240 log.error(res.stderr) 6241 res.check_returncode() 6242 else: 6243 log.warning(f"Splice tool configuration not found: {config}") 6244 
6245 # Update variants 6246 log.info("Annotation - Updating...") 6247 # Test find output vcf 6248 log.debug( 6249 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6250 ) 6251 output_vcf = [] 6252 # Wrong folder to look in 6253 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6254 if ( 6255 files 6256 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6257 ): 6258 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6259 # log.debug(os.listdir(options.get("output_folder"))) 6260 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6261 if not output_vcf: 6262 log.debug( 6263 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6264 ) 6265 else: 6266 # Get new header from annotated vcf 6267 log.debug(f"Initial header: {len(header.infos)} fields") 6268 # Create new header with splice infos 6269 new_vcf = Variants(input=output_vcf[0]) 6270 new_vcf_header = new_vcf.get_header().infos 6271 for keys, infos in new_vcf_header.items(): 6272 if keys not in header.infos.keys(): 6273 header.infos[keys] = infos 6274 log.debug(f"New header: {len(header.infos)} fields") 6275 log.debug(f"Splice tmp output: {output_vcf[0]}") 6276 self.update_from_vcf(output_vcf[0]) 6277 6278 # Remove folder 6279 remove_if_exists(output_folder) 6280 6281 ### 6282 # Prioritization 6283 ### 6284 6285 def get_config_default(self, name: str) -> dict: 6286 """ 6287 The function `get_config_default` returns a dictionary containing default configurations for 6288 various calculations and prioritizations. 6289 6290 :param name: The `get_config_default` function returns a dictionary containing default 6291 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6292 specify which specific configuration to retrieve from the dictionary 6293 :type name: str 6294 :return: The function `get_config_default` returns a dictionary containing default configuration 6295 settings for different calculations and prioritizations. The specific configuration settings are 6296 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6297 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6298 returned. If there is no match, an empty dictionary is returned. 6299 """ 6300 6301 config_default = { 6302 "calculations": { 6303 "variant_chr_pos_alt_ref": { 6304 "type": "sql", 6305 "name": "variant_chr_pos_alt_ref", 6306 "description": "Create a variant ID with chromosome, position, alt and ref", 6307 "available": False, 6308 "output_column_name": "variant_chr_pos_alt_ref", 6309 "output_column_type": "String", 6310 "output_column_description": "variant ID with chromosome, position, alt and ref", 6311 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6312 "operation_info": True, 6313 }, 6314 "VARTYPE": { 6315 "type": "sql", 6316 "name": "VARTYPE", 6317 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6318 "available": True, 6319 "output_column_name": "VARTYPE", 6320 "output_column_type": "String", 6321 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6322 "operation_query": """ 6323 CASE 6324 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6325 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6326 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6327 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6328 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6329 ELSE 'UNDEFINED' 6330 END 6331 """, 6332 "info_fields": ["SVTYPE"], 6333 "operation_info": True, 6334 }, 6335 "snpeff_hgvs": { 6336 "type": "python", 6337 "name": "snpeff_hgvs", 6338 "description": "HGVS nomenclatures from snpEff annotation", 6339 "available": True, 6340 "function_name": "calculation_extract_snpeff_hgvs", 6341 "function_params": ["snpeff_hgvs", "ANN"], 6342 }, 6343 "snpeff_ann_explode": { 6344 "type": "python", 6345 "name": "snpeff_ann_explode", 6346 "description": "Explode snpEff annotations with uniquify values", 6347 "available": True, 6348 "function_name": "calculation_snpeff_ann_explode", 6349 "function_params": [False, "fields", "snpeff_", "ANN"], 6350 }, 6351 "snpeff_ann_explode_uniquify": { 6352 "type": "python", 6353 "name": "snpeff_ann_explode_uniquify", 6354 "description": "Explode snpEff annotations", 6355 "available": True, 6356 "function_name": "calculation_snpeff_ann_explode", 6357 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6358 }, 6359 "snpeff_ann_explode_json": { 6360 "type": "python", 6361 "name": "snpeff_ann_explode_json", 6362 "description": "Explode snpEff annotations in JSON format", 6363 "available": True, 6364 "function_name": "calculation_snpeff_ann_explode", 6365 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6366 }, 6367 "NOMEN": { 6368 "type": "python", 6369 "name": "NOMEN", 6370 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6371 "available": True, 6372 "function_name": "calculation_extract_nomen", 6373 "function_params": [], 6374 }, 6375 "FINDBYPIPELINE": { 6376 "type": "python", 6377 "name": "FINDBYPIPELINE", 6378 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6379 "available": True, 6380 "function_name": "calculation_find_by_pipeline", 6381 "function_params": ["findbypipeline"], 6382 }, 6383 "FINDBYSAMPLE": { 6384 "type": "python", 6385 "name": "FINDBYSAMPLE", 6386 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6387 "available": True, 6388 "function_name": "calculation_find_by_pipeline", 6389 "function_params": ["findbysample"], 6390 }, 6391 "GENOTYPECONCORDANCE": { 6392 "type": "python", 6393 "name": "GENOTYPECONCORDANCE", 6394 "description": "Concordance of genotype for multi caller VCF", 6395 "available": True, 6396 "function_name": "calculation_genotype_concordance", 6397 "function_params": [], 6398 }, 6399 "BARCODE": { 6400 "type": "python", 6401 "name": "BARCODE", 6402 "description": "BARCODE as VaRank tool", 6403 "available": True, 6404 "function_name": "calculation_barcode", 6405 "function_params": [], 6406 }, 6407 "BARCODEFAMILY": { 6408 "type": "python", 6409 "name": "BARCODEFAMILY", 6410 "description": "BARCODEFAMILY as VaRank tool", 6411 "available": True, 6412 "function_name": "calculation_barcode_family", 6413 "function_params": ["BCF"], 6414 }, 6415 "TRIO": { 6416 "type": "python", 6417 "name": "TRIO", 6418 "description": "Inheritance for a trio family", 6419 "available": True, 6420 "function_name": "calculation_trio", 6421 "function_params": [], 6422 }, 6423 "VAF": { 6424 "type": "python", 6425 "name": "VAF", 6426 "description": "Variant Allele Frequency (VAF) harmonization", 6427 "available": True, 6428 "function_name": "calculation_vaf_normalization", 6429 "function_params": [], 6430 }, 6431 "VAF_stats": { 6432 "type": "python", 6433 "name": 
"VAF_stats", 6434 "description": "Variant Allele Frequency (VAF) statistics", 6435 "available": True, 6436 "function_name": "calculation_genotype_stats", 6437 "function_params": ["VAF"], 6438 }, 6439 "DP_stats": { 6440 "type": "python", 6441 "name": "DP_stats", 6442 "description": "Depth (DP) statistics", 6443 "available": True, 6444 "function_name": "calculation_genotype_stats", 6445 "function_params": ["DP"], 6446 }, 6447 "variant_id": { 6448 "type": "python", 6449 "name": "variant_id", 6450 "description": "Variant ID generated from variant position and type", 6451 "available": True, 6452 "function_name": "calculation_variant_id", 6453 "function_params": [], 6454 }, 6455 "transcripts_json": { 6456 "type": "python", 6457 "name": "transcripts_json", 6458 "description": "Add transcripts info in JSON format (field 'transcripts_json')", 6459 "available": True, 6460 "function_name": "calculation_transcripts_json", 6461 "function_params": ["transcripts_json"], 6462 }, 6463 "transcripts_prioritization": { 6464 "type": "python", 6465 "name": "transcripts_prioritization", 6466 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6467 "available": True, 6468 "function_name": "calculation_transcripts_prioritization", 6469 "function_params": [], 6470 }, 6471 }, 6472 "prioritizations": { 6473 "default": { 6474 "filter": [ 6475 { 6476 "type": "notequals", 6477 "value": "!PASS|\\.", 6478 "score": 0, 6479 "flag": "FILTERED", 6480 "comment": ["Bad variant quality"], 6481 }, 6482 { 6483 "type": "equals", 6484 "value": "REJECT", 6485 "score": -20, 6486 "flag": "PASS", 6487 "comment": ["Bad variant quality"], 6488 }, 6489 ], 6490 "DP": [ 6491 { 6492 "type": "gte", 6493 "value": "50", 6494 "score": 5, 6495 "flag": "PASS", 6496 "comment": ["DP higher than 50"], 6497 } 6498 ], 6499 "ANN": [ 6500 { 6501 "type": "contains", 6502 "value": "HIGH", 6503 "score": 5, 6504 "flag": "PASS", 6505 "comment": [ 6506 "The variant is assumed to have high 
(disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6507 ], 6508 }, 6509 { 6510 "type": "contains", 6511 "value": "MODERATE", 6512 "score": 3, 6513 "flag": "PASS", 6514 "comment": [ 6515 "A non-disruptive variant that might change protein effectiveness" 6516 ], 6517 }, 6518 { 6519 "type": "contains", 6520 "value": "LOW", 6521 "score": 0, 6522 "flag": "FILTERED", 6523 "comment": [ 6524 "Assumed to be mostly harmless or unlikely to change protein behavior" 6525 ], 6526 }, 6527 { 6528 "type": "contains", 6529 "value": "MODIFIER", 6530 "score": 0, 6531 "flag": "FILTERED", 6532 "comment": [ 6533 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6534 ], 6535 }, 6536 ], 6537 } 6538 }, 6539 } 6540 6541 return config_default.get(name, None) 6542 6543 def get_config_json( 6544 self, name: str, config_dict: dict = {}, config_file: str = None 6545 ) -> dict: 6546 """ 6547 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6548 default values, a dictionary, and a file. 6549 6550 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6551 the name of the configuration. It is used to identify and retrieve the configuration settings 6552 for a specific component or module 6553 :type name: str 6554 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6555 dictionary that allows you to provide additional configuration settings or overrides. 
When you 6556 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6557 the key is the configuration setting you want to override or 6558 :type config_dict: dict 6559 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6560 specify the path to a configuration file that contains additional settings. If provided, the 6561 function will read the contents of this file and update the configuration dictionary with the 6562 values found in the file, overriding any existing values with the 6563 :type config_file: str 6564 :return: The function `get_config_json` returns a dictionary containing the configuration 6565 settings. 6566 """ 6567 6568 # Create with default prioritizations 6569 config_default = self.get_config_default(name=name) 6570 configuration = config_default 6571 # log.debug(f"configuration={configuration}") 6572 6573 # Replace prioritizations from dict 6574 for config in config_dict: 6575 configuration[config] = config_dict[config] 6576 6577 # Replace prioritizations from file 6578 config_file = full_path(config_file) 6579 if config_file: 6580 if os.path.exists(config_file): 6581 with open(config_file) as config_file_content: 6582 config_file_dict = json.load(config_file_content) 6583 for config in config_file_dict: 6584 configuration[config] = config_file_dict[config] 6585 else: 6586 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6587 log.error(msg_error) 6588 raise ValueError(msg_error) 6589 6590 return configuration 6591 6592 def prioritization( 6593 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6594 ) -> bool: 6595 """ 6596 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6597 prioritizes variants based on configured profiles and criteria. 
6598 6599 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6600 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6601 a table name is provided, the method will prioritize the variants in that specific table 6602 :type table: str 6603 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6604 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6605 provided, the code will use a default prefix value of "PZ" 6606 :type pz_prefix: str 6607 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6608 additional parameters specific to the prioritization process. These parameters can include 6609 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6610 configurations needed for the prioritization of variants in a V 6611 :type pz_param: dict 6612 :return: A boolean value (True) is being returned from the `prioritization` function. 
6613 """ 6614 6615 # Config 6616 config = self.get_config() 6617 6618 # Param 6619 param = self.get_param() 6620 6621 # Prioritization param 6622 if pz_param is not None: 6623 prioritization_param = pz_param 6624 else: 6625 prioritization_param = param.get("prioritization", {}) 6626 6627 # Configuration profiles 6628 prioritization_config_file = prioritization_param.get( 6629 "prioritization_config", None 6630 ) 6631 prioritization_config_file = full_path(prioritization_config_file) 6632 prioritizations_config = self.get_config_json( 6633 name="prioritizations", config_file=prioritization_config_file 6634 ) 6635 6636 # Prioritization prefix 6637 pz_prefix_default = "PZ" 6638 if pz_prefix is None: 6639 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6640 6641 # Prioritization options 6642 profiles = prioritization_param.get("profiles", []) 6643 if isinstance(profiles, str): 6644 profiles = profiles.split(",") 6645 pzfields = prioritization_param.get( 6646 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6647 ) 6648 if isinstance(pzfields, str): 6649 pzfields = pzfields.split(",") 6650 default_profile = prioritization_param.get("default_profile", None) 6651 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6652 prioritization_score_mode = prioritization_param.get( 6653 "prioritization_score_mode", "HOWARD" 6654 ) 6655 6656 # Quick Prioritizations 6657 prioritizations = param.get("prioritizations", None) 6658 if prioritizations: 6659 log.info("Quick Prioritization:") 6660 for profile in prioritizations.split(","): 6661 if profile not in profiles: 6662 profiles.append(profile) 6663 log.info(f" {profile}") 6664 6665 # If profile "ALL" provided, all profiles in the config profiles 6666 if "ALL" in profiles: 6667 profiles = list(prioritizations_config.keys()) 6668 6669 for profile in profiles: 6670 if prioritizations_config.get(profile, None): 6671 log.debug(f"Profile '{profile}' configured") 6672 else: 6673 msg_error = f"Profile 
'{profile}' NOT configured" 6674 log.error(msg_error) 6675 raise ValueError(msg_error) 6676 6677 if profiles: 6678 log.info(f"Prioritization... ") 6679 else: 6680 log.debug(f"No profile defined") 6681 return False 6682 6683 if not default_profile and len(profiles): 6684 default_profile = profiles[0] 6685 6686 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6687 log.debug("Profiles to check: " + str(list(profiles))) 6688 6689 # Variables 6690 if table is not None: 6691 table_variants = table 6692 else: 6693 table_variants = self.get_table_variants(clause="update") 6694 log.debug(f"Table to prioritize: {table_variants}") 6695 6696 # Added columns 6697 added_columns = [] 6698 6699 # Create list of PZfields 6700 # List of PZFields 6701 list_of_pzfields_original = pzfields + [ 6702 pzfield + pzfields_sep + profile 6703 for pzfield in pzfields 6704 for profile in profiles 6705 ] 6706 list_of_pzfields = [] 6707 log.debug(f"{list_of_pzfields_original}") 6708 6709 # Remove existing PZfields to use if exists 6710 for pzfield in list_of_pzfields_original: 6711 if self.get_header().infos.get(pzfield, None) is None: 6712 list_of_pzfields.append(pzfield) 6713 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6714 else: 6715 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6716 6717 if list_of_pzfields: 6718 6719 # Explode Infos prefix 6720 explode_infos_prefix = self.get_explode_infos_prefix() 6721 6722 # PZfields tags description 6723 PZfields_INFOS = { 6724 f"{pz_prefix}Tags": { 6725 "ID": f"{pz_prefix}Tags", 6726 "Number": ".", 6727 "Type": "String", 6728 "Description": "Variant tags based on annotation criteria", 6729 }, 6730 f"{pz_prefix}Score": { 6731 "ID": f"{pz_prefix}Score", 6732 "Number": 1, 6733 "Type": "Integer", 6734 "Description": "Variant score based on annotation criteria", 6735 }, 6736 f"{pz_prefix}Flag": { 6737 "ID": f"{pz_prefix}Flag", 6738 "Number": 1, 6739 "Type": "String", 6740 
"Description": "Variant flag based on annotation criteria", 6741 }, 6742 f"{pz_prefix}Comment": { 6743 "ID": f"{pz_prefix}Comment", 6744 "Number": ".", 6745 "Type": "String", 6746 "Description": "Variant comment based on annotation criteria", 6747 }, 6748 f"{pz_prefix}Infos": { 6749 "ID": f"{pz_prefix}Infos", 6750 "Number": ".", 6751 "Type": "String", 6752 "Description": "Variant infos based on annotation criteria", 6753 }, 6754 } 6755 6756 # Create INFO fields if not exist 6757 for field in PZfields_INFOS: 6758 field_ID = PZfields_INFOS[field]["ID"] 6759 field_description = PZfields_INFOS[field]["Description"] 6760 if field_ID not in self.get_header().infos and field_ID in pzfields: 6761 field_description = ( 6762 PZfields_INFOS[field]["Description"] 6763 + f", profile {default_profile}" 6764 ) 6765 self.get_header().infos[field_ID] = vcf.parser._Info( 6766 field_ID, 6767 PZfields_INFOS[field]["Number"], 6768 PZfields_INFOS[field]["Type"], 6769 field_description, 6770 "unknown", 6771 "unknown", 6772 code_type_map[PZfields_INFOS[field]["Type"]], 6773 ) 6774 6775 # Create INFO fields if not exist for each profile 6776 for profile in prioritizations_config: 6777 if profile in profiles or profiles == []: 6778 for field in PZfields_INFOS: 6779 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6780 field_description = ( 6781 PZfields_INFOS[field]["Description"] 6782 + f", profile {profile}" 6783 ) 6784 if ( 6785 field_ID not in self.get_header().infos 6786 and field in pzfields 6787 ): 6788 self.get_header().infos[field_ID] = vcf.parser._Info( 6789 field_ID, 6790 PZfields_INFOS[field]["Number"], 6791 PZfields_INFOS[field]["Type"], 6792 field_description, 6793 "unknown", 6794 "unknown", 6795 code_type_map[PZfields_INFOS[field]["Type"]], 6796 ) 6797 6798 # Header 6799 for pzfield in list_of_pzfields: 6800 if re.match(f"{pz_prefix}Score.*", pzfield): 6801 added_column = self.add_column( 6802 table_name=table_variants, 6803 column_name=pzfield, 6804 
column_type="INTEGER", 6805 default_value="0", 6806 ) 6807 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6808 added_column = self.add_column( 6809 table_name=table_variants, 6810 column_name=pzfield, 6811 column_type="BOOLEAN", 6812 default_value="1", 6813 ) 6814 else: 6815 added_column = self.add_column( 6816 table_name=table_variants, 6817 column_name=pzfield, 6818 column_type="STRING", 6819 default_value="''", 6820 ) 6821 added_columns.append(added_column) 6822 6823 # Profiles 6824 if profiles: 6825 6826 # foreach profile in configuration file 6827 for profile in prioritizations_config: 6828 6829 # If profile is asked in param, or ALL are asked (empty profile []) 6830 if profile in profiles or profiles == []: 6831 log.info(f"Profile '{profile}'") 6832 6833 sql_set_info_option = "" 6834 6835 sql_set_info = [] 6836 6837 # PZ fields set 6838 6839 # PZScore 6840 if ( 6841 f"{pz_prefix}Score{pzfields_sep}{profile}" 6842 in list_of_pzfields 6843 ): 6844 sql_set_info.append( 6845 f""" 6846 concat( 6847 '{pz_prefix}Score{pzfields_sep}{profile}=', 6848 {pz_prefix}Score{pzfields_sep}{profile} 6849 ) 6850 """ 6851 ) 6852 if ( 6853 profile == default_profile 6854 and f"{pz_prefix}Score" in list_of_pzfields 6855 ): 6856 sql_set_info.append( 6857 f""" 6858 concat( 6859 '{pz_prefix}Score=', 6860 {pz_prefix}Score{pzfields_sep}{profile} 6861 ) 6862 """ 6863 ) 6864 6865 # PZFlag 6866 if ( 6867 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6868 in list_of_pzfields 6869 ): 6870 sql_set_info.append( 6871 f""" 6872 concat( 6873 '{pz_prefix}Flag{pzfields_sep}{profile}=', 6874 CASE 6875 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 6876 THEN 'PASS' 6877 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 6878 THEN 'FILTERED' 6879 END 6880 ) 6881 """ 6882 ) 6883 if ( 6884 profile == default_profile 6885 and f"{pz_prefix}Flag" in list_of_pzfields 6886 ): 6887 sql_set_info.append( 6888 f""" 6889 concat( 6890 '{pz_prefix}Flag=', 6891 CASE 6892 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 6893 
THEN 'PASS' 6894 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 6895 THEN 'FILTERED' 6896 END 6897 ) 6898 """ 6899 ) 6900 6901 # PZComment 6902 if ( 6903 f"{pz_prefix}Comment{pzfields_sep}{profile}" 6904 in list_of_pzfields 6905 ): 6906 sql_set_info.append( 6907 f""" 6908 CASE 6909 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 6910 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 6911 ELSE '' 6912 END 6913 """ 6914 ) 6915 if ( 6916 profile == default_profile 6917 and f"{pz_prefix}Comment" in list_of_pzfields 6918 ): 6919 sql_set_info.append( 6920 f""" 6921 CASE 6922 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 6923 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 6924 ELSE '' 6925 END 6926 """ 6927 ) 6928 6929 # PZInfos 6930 if ( 6931 f"{pz_prefix}Infos{pzfields_sep}{profile}" 6932 in list_of_pzfields 6933 ): 6934 sql_set_info.append( 6935 f""" 6936 CASE 6937 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 6938 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 6939 ELSE '' 6940 END 6941 """ 6942 ) 6943 if ( 6944 profile == default_profile 6945 and f"{pz_prefix}Infos" in list_of_pzfields 6946 ): 6947 sql_set_info.append( 6948 f""" 6949 CASE 6950 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 6951 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 6952 ELSE '' 6953 END 6954 """ 6955 ) 6956 6957 # Merge PZfields 6958 sql_set_info_option = "" 6959 sql_set_sep = "" 6960 for sql_set in sql_set_info: 6961 if sql_set_sep: 6962 sql_set_info_option += f""" 6963 , concat('{sql_set_sep}', {sql_set}) 6964 """ 6965 else: 6966 sql_set_info_option += f""" 6967 , {sql_set} 6968 """ 6969 sql_set_sep = ";" 6970 6971 sql_queries = [] 6972 for annotation in prioritizations_config[profile]: 6973 6974 # Explode specific annotation 6975 log.debug(f"Explode annotation '{annotation}'") 6976 
added_columns += self.explode_infos( 6977 prefix=explode_infos_prefix, 6978 fields=[annotation], 6979 table=table_variants, 6980 ) 6981 extra_infos = self.get_extra_infos(table=table_variants) 6982 6983 # Check if annotation field is present 6984 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 6985 log.debug(f"Annotation '{annotation}' not in data") 6986 continue 6987 else: 6988 log.debug(f"Annotation '{annotation}' in data") 6989 6990 # For each criterions 6991 for criterion in prioritizations_config[profile][ 6992 annotation 6993 ]: 6994 criterion_type = criterion["type"] 6995 criterion_value = criterion["value"] 6996 criterion_score = criterion.get("score", 0) 6997 criterion_flag = criterion.get("flag", "PASS") 6998 criterion_flag_bool = criterion_flag == "PASS" 6999 criterion_comment = ( 7000 ", ".join(criterion.get("comment", [])) 7001 .replace("'", "''") 7002 .replace(";", ",") 7003 .replace("\t", " ") 7004 ) 7005 criterion_infos = ( 7006 str(criterion) 7007 .replace("'", "''") 7008 .replace(";", ",") 7009 .replace("\t", " ") 7010 ) 7011 7012 sql_set = [] 7013 sql_set_info = [] 7014 7015 # PZ fields set 7016 if ( 7017 f"{pz_prefix}Score{pzfields_sep}{profile}" 7018 in list_of_pzfields 7019 ): 7020 if prioritization_score_mode == "HOWARD": 7021 sql_set.append( 7022 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7023 ) 7024 elif prioritization_score_mode == "VaRank": 7025 sql_set.append( 7026 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7027 ) 7028 else: 7029 sql_set.append( 7030 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7031 ) 7032 if ( 7033 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7034 in list_of_pzfields 7035 ): 7036 sql_set.append( 7037 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND 
{criterion_flag_bool}" 7038 ) 7039 if ( 7040 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7041 in list_of_pzfields 7042 ): 7043 sql_set.append( 7044 f""" 7045 {pz_prefix}Comment{pzfields_sep}{profile} = 7046 concat( 7047 {pz_prefix}Comment{pzfields_sep}{profile}, 7048 CASE 7049 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7050 THEN ', ' 7051 ELSE '' 7052 END, 7053 '{criterion_comment}' 7054 ) 7055 """ 7056 ) 7057 if ( 7058 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7059 in list_of_pzfields 7060 ): 7061 sql_set.append( 7062 f""" 7063 {pz_prefix}Infos{pzfields_sep}{profile} = 7064 concat( 7065 {pz_prefix}Infos{pzfields_sep}{profile}, 7066 '{criterion_infos}' 7067 ) 7068 """ 7069 ) 7070 sql_set_option = ",".join(sql_set) 7071 7072 # Criterion and comparison 7073 if sql_set_option: 7074 try: 7075 float(criterion_value) 7076 sql_update = f""" 7077 UPDATE {table_variants} 7078 SET {sql_set_option} 7079 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7080 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7081 """ 7082 except: 7083 contains_option = "" 7084 if criterion_type == "contains": 7085 contains_option = ".*" 7086 sql_update = f""" 7087 UPDATE {table_variants} 7088 SET {sql_set_option} 7089 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7090 """ 7091 sql_queries.append(sql_update) 7092 else: 7093 log.warning( 7094 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7095 ) 7096 7097 # PZTags 7098 if ( 7099 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7100 in list_of_pzfields 7101 ): 7102 7103 # Create PZFalgs value 7104 pztags_value = "" 7105 pztags_sep_default = "|" 7106 pztags_sep = "" 7107 for pzfield in pzfields: 7108 if pzfield not in [f"{pz_prefix}Tags"]: 7109 if ( 7110 f"{pzfield}{pzfields_sep}{profile}" 7111 in list_of_pzfields 7112 ): 7113 if pzfield in [f"{pz_prefix}Flag"]: 7114 pztags_value += 
f"""{pztags_sep}{pzfield}#', 7115 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7116 THEN 'PASS' 7117 ELSE 'FILTERED' 7118 END, '""" 7119 else: 7120 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7121 pztags_sep = pztags_sep_default 7122 7123 # Add Query update for PZFlags 7124 sql_update_pztags = f""" 7125 UPDATE {table_variants} 7126 SET INFO = concat( 7127 INFO, 7128 CASE WHEN INFO NOT in ('','.') 7129 THEN ';' 7130 ELSE '' 7131 END, 7132 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7133 ) 7134 """ 7135 sql_queries.append(sql_update_pztags) 7136 7137 # Add Query update for PZFlags for default 7138 if profile == default_profile: 7139 sql_update_pztags_default = f""" 7140 UPDATE {table_variants} 7141 SET INFO = concat( 7142 INFO, 7143 ';', 7144 '{pz_prefix}Tags={pztags_value}' 7145 ) 7146 """ 7147 sql_queries.append(sql_update_pztags_default) 7148 7149 log.info(f"""Profile '{profile}' - Prioritization... """) 7150 7151 if sql_queries: 7152 7153 for sql_query in sql_queries: 7154 log.debug( 7155 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7156 ) 7157 self.conn.execute(sql_query) 7158 7159 log.info(f"""Profile '{profile}' - Update... 
""") 7160 sql_query_update = f""" 7161 UPDATE {table_variants} 7162 SET INFO = 7163 concat( 7164 CASE 7165 WHEN INFO NOT IN ('','.') 7166 THEN concat(INFO, ';') 7167 ELSE '' 7168 END 7169 {sql_set_info_option} 7170 ) 7171 """ 7172 self.conn.execute(sql_query_update) 7173 7174 else: 7175 7176 log.warning(f"No profiles in parameters") 7177 7178 # Remove added columns 7179 for added_column in added_columns: 7180 self.drop_column(column=added_column) 7181 7182 # Explode INFOS fields into table fields 7183 if self.get_explode_infos(): 7184 self.explode_infos( 7185 prefix=self.get_explode_infos_prefix(), 7186 fields=self.get_explode_infos_fields(), 7187 force=True, 7188 ) 7189 7190 return True 7191 7192 ### 7193 # HGVS 7194 ### 7195 7196 def annotation_hgvs(self, threads: int = None) -> None: 7197 """ 7198 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7199 coordinates and alleles. 7200 7201 :param threads: The `threads` parameter is an optional integer that specifies the number of 7202 threads to use for parallel processing. If no value is provided, it will default to the number 7203 of threads obtained from the `get_threads()` method 7204 :type threads: int 7205 """ 7206 7207 # Function for each partition of the Dask Dataframe 7208 def partition_function(partition): 7209 """ 7210 The function `partition_function` applies the `annotation_hgvs_partition` function to 7211 each row of a DataFrame called `partition`. 7212 7213 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7214 to be processed 7215 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7216 the "partition" dataframe along the axis 1. 
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the HGVS annotation string for one variant row.

            Relies on variables captured from the enclosing `annotation_hgvs`
            scope: `polars_conn`, `transcripts`, `genome` and the
            `use_*`/`add_protein`/`full_format`/`codon_type` options.

            :param row: dict-like row providing "CHROM", "POS", "REF" and "ALT"
            :return: comma-separated list of HGVS names for the row
            """

            # NOTE(review): 'chr' shadows the builtin of the same name (kept as-is)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (values are interpolated straight into the SQL; they come from the
            # variants table itself, not from external user input)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number lookup only when requested (it has a cost per variant)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession (refSeqLink), needed for protein-level HGVS
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set, also append a protein-level HGVS name
                # (skipped if use_protein/full_format already produced one)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection (registers surrounding globals as queryable frames)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (falls back to the packaged default folder)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # NOTE(review): same config key as above but with "" fallback — used
        # below as a direct genome path; confirm this duplication is intended
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "opt1=val1,opt2,..." shortcuts into param["hgvs"]
        # (a bare option name means True; "true"/"false" strings are coerced)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled (early return when no "hgvs" section)
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param (all options default to disabled; codon_type defaults to 3-letter)
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink files may be overridden from the HGVS params
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly (param overrides config, then packaged default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: direct path first, otherwise resolved from folder + assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (the regex keeps alphabetic REF/ALT only, excluding symbolic alleles)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (dropped again at the end of the method)
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): random suffix avoids clashing with an existing column,
        # but randrange(1000) gives no collision guarantee — confirm acceptable
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe (only transcripts overlapping a variant)
        refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            # NOTE(review): 'chrom' is selected from {refseq_table} while the other
            # columns come from {refseqlink_table} — confirm this is intended
            refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        # Dump the variant-overlapping refSeq transcripts to a TSV and parse them
        # into transcript models (read_transcripts expects a file handle)
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        # NOTE(review): a polars SQLContext was already created earlier in this
        # method — this second creation looks redundant; confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file, matching on variant key
            update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Update INFO column (append 'hgvs=...' with ';' separator when needed)
            sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Declare the new INFO field in the VCF header
        # NOTE(review): uses module-level code_type_map while sibling methods use
        # self.code_type_map — confirm both refer to the same mapping
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a sorted, human-readable list of the available calculation operations.

        :param operations_config_dict: optional in-memory operations configuration
        :param operations_config_file: optional path to an operations configuration file
        :return: list of help lines, headed by "Available calculation operations:"
        """

        # Init
        operations_help = []

        # operations
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        # Keep only operations flagged as available
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function.

        NOTE(review): the dict defaults are mutable default arguments; they are
        only read here, but consider `None` defaults.

        param json example:
            "calculation": {
              "NOMEN": {
                "options": {
                  "hgvs_field": "hgvs"
                },
              "middle" : null
            }
        """

        # Param
        param = self.get_param()

        # operations config (merged from dict and/or file)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (param takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add comma-separated operation names from param
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    # Mirror the quick operation into the param tree
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation (fallback to param when still empty)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations: dispatch to the python or sql processor
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        mathematical operation to be performed. It includes the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed. It is used for logging and error handling purposes,
        defaults to unknown
        :type operation_name: str (optional)
        """

        # table variants
        table_variants = self.get_table_variants(clause="alter")

        # Operation infos
        # NOTE(review): the 'operation_name' argument is overwritten by the name
        # found in the operation dict; the parameter is effectively unused
        operation_name = operation.get("name", "unknown")
        log.debug(f"process sql {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        # The operation query may be given as a list of SQL fragments to join
        operation_query = operation.get("operation_query", None)
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)

        if operation_query:

            # Info fields check: every required INFO field must exist in the header
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added_columns (temporary columns, dropped at the end)
                added_columns = []

                # Create VCF header field
                vcf_reader = self.get_header()
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed (makes INFO fields queryable as columns)
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=True,
                )

                # Create column to receive the calculation result
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                    """
                    self.conn.execute(sql_update)

                    # Add to INFO (append '<name>=<value>' with ';' separator)
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" = 
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                        """
                        self.conn.execute(sql_update_info)

                # NOTE(review): bare 'except' hides the root cause; consider
                # 'except Exception as e' and 'raise ValueError(...) from e'
                except:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )

    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: dictionary describing the operation; must provide the
        "name", "function_name" and "function_params" keys (KeyError otherwise)
        :type operation: dict
        :param operation_name: name of the operation, used for logging purposes,
        defaults to unknown. NOTE(review): immediately overwritten by
        operation["name"], so effectively unused.
        :type operation_name: str (optional)
        """

        operation_name = operation["name"]
        # NOTE(review): message says 'sql' but this is the python-function path
        log.debug(f"process sql {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        # Dispatch to the named method of this instance
        getattr(self, function_name)(*function_params)

    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        # The variant-id column is dropped again at the end of the method
        # NOTE(review): assumes get_variant_id_column() created the column — confirm
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update: append '<tag>=<value>' to INFO with ';' separator when needed
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" = 
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: name of the INFO field that will receive the HGVS
        nomenclatures extracted from the snpEff annotations, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field that contains the snpEff
        annotations, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is replaced by "INFO/";
        # the condition looks inverted ('if prefix' vs 'if not prefix') — confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded-column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff stores the sub-field names in the
            # description, quoted and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the lookup key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (used as join key for the UPDATE below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<snpeff_hgvs>=<value>' to INFO, joining the
            # dataframe (registered by name in duckdb) on the variant id
            # NOTE(review): table name 'variants' is hard-coded here while the
            # WHERE clause uses {table_variants} — confirm they always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation sub-fields and updating variant information accordingly.

        :param uniquify: when True, duplicate annotation entries are removed from
        the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: format of the generated annotations; "fields"
        produces one INFO field per annotation sub-field, "JSON" produces a
        single JSON-encoded INFO field, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated INFO fields to
        distinguish them from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field that contains the snpEff
        annotations, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field (internal working-column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): same inverted-looking condition as in
        # calculation_extract_snpeff_hgvs — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded-column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header (sub-field names are quoted in the description)
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the field key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (used as join key for the UPDATE below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

8161 # Create snpEff columns 8162 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8163 speff_ann_infos 8164 ].apply( 8165 lambda x: explode_snpeff_ann( 8166 str(x), 8167 uniquify=uniquify, 8168 output_format=output_format, 8169 prefix=output_prefix, 8170 header=list(ann_header_desc.values()), 8171 ) 8172 ) 8173 8174 # Header 8175 ann_annotations_prefix = "" 8176 if output_format.upper() in ["JSON"]: 8177 ann_annotations_prefix = f"{output_prefix}=" 8178 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8179 output_prefix, 8180 ".", 8181 "String", 8182 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8183 + " - JSON format", 8184 "howard calculation", 8185 "0", 8186 self.code_type_map.get("String"), 8187 ) 8188 else: 8189 for ann_annotation in ann_header: 8190 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8191 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8192 ann_annotation_id, 8193 ".", 8194 "String", 8195 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8196 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8197 "howard calculation", 8198 "0", 8199 self.code_type_map.get("String"), 8200 ) 8201 8202 # Update 8203 sql_update = f""" 8204 UPDATE variants 8205 SET "INFO" = 8206 concat( 8207 CASE 8208 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8209 THEN '' 8210 ELSE concat("INFO", ';') 8211 END, 8212 CASE 8213 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8214 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8215 THEN concat( 8216 '{ann_annotations_prefix}', 8217 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8218 ) 8219 ELSE '' 8220 END 8221 ) 8222 FROM dataframe_snpeff_hgvs 8223 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8224 8225 """ 8226 self.conn.execute(sql_update) 8227 8228 # Delete dataframe 8229 del dataframe_snpeff_hgvs 8230 gc.collect() 8231 8232 else: 8233 8234 log.warning( 8235 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8236 ) 8237 8238 # Remove added columns 8239 for added_column in added_columns: 8240 self.drop_column(column=added_column) 8241 8242 def calculation_extract_nomen(self) -> None: 8243 """ 8244 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8245 """ 8246 8247 # NOMEN field 8248 field_nomen_dict = "NOMEN_DICT" 8249 8250 # NOMEN structure 8251 nomen_dict = { 8252 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8253 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8254 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8255 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8256 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8257 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8258 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8259 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8260 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8261 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8262 } 8263 8264 # Param 8265 param = self.get_param() 8266 8267 # Prefix 8268 prefix = self.get_explode_infos_prefix() 8269 8270 # Header 8271 vcf_reader = self.get_header() 8272 8273 # Get HGVS field 8274 hgvs_field = ( 8275 param.get("calculation", {}) 8276 .get("calculations", {}) 8277 .get("NOMEN", {}) 8278 .get("options", {}) 8279 .get("hgvs_field", "hgvs") 8280 ) 8281 8282 # Get transcripts 8283 transcripts_file = ( 8284 param.get("calculation", {}) 8285 .get("calculations", {}) 8286 .get("NOMEN", {}) 8287 .get("options", {}) 8288 .get("transcripts", None) 8289 ) 8290 transcripts_file = full_path(transcripts_file) 8291 transcripts = [] 8292 if transcripts_file: 8293 if os.path.exists(transcripts_file): 8294 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8295 transcripts = transcripts_dataframe.iloc[:, 0].tolist() 8296 else: 8297 log.error(f"Transcript file '{transcripts_file}' does NOT exist") 8298 raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist") 8299 8300 # Added columns 8301 added_columns = [] 8302 8303 # Explode HGVS field in column 8304 added_columns += self.explode_infos(fields=[hgvs_field]) 8305 8306 # extra infos 8307 extra_infos = self.get_extra_infos() 8308 extra_field = prefix + hgvs_field 8309 8310 if extra_field in extra_infos: 8311 8312 # Create dataframe 8313 dataframe_hgvs = self.get_query_to_df( 8314 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """ 8315 ) 8316 8317 # Create main NOMEN column 8318 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply( 8319 lambda x: find_nomen(str(x), transcripts=transcripts) 8320 ) 8321 8322 # Explode NOMEN Structure and create SQL set for update 8323 sql_nomen_fields = [] 8324 for nomen_field in nomen_dict: 8325 8326 # Explode each field 
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Skipped entirely when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (column name holding the computed value in the dataframe)
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary join key, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE: the local variable name 'dataframe_findbypipeline' is
            # load-bearing — the UPDATE below references it and duckdb resolves
            # it via a replacement scan on the Python local; do not rename.
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (per-row aggregation over all samples)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, separated by ';' when
            # INFO already has content; empty/'.'/NaN values append nothing
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()
concat("INFO", ';') 8467 END, 8468 CASE 8469 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8470 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8471 THEN concat( 8472 '{findbypipeline_tag}=', 8473 dataframe_findbypipeline."{findbypipeline_infos}" 8474 ) 8475 ELSE '' 8476 END 8477 ) 8478 FROM dataframe_findbypipeline 8479 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8480 """ 8481 self.conn.execute(sql_update) 8482 8483 # Remove added columns 8484 for added_column in added_columns: 8485 self.drop_column(column=added_column) 8486 8487 # Delete dataframe 8488 del dataframe_findbypipeline 8489 gc.collect() 8490 8491 def calculation_genotype_concordance(self) -> None: 8492 """ 8493 The function `calculation_genotype_concordance` calculates the genotype concordance for 8494 multi-caller VCF files and updates the variant information in the database. 8495 """ 8496 8497 # if FORMAT and samples 8498 if ( 8499 "FORMAT" in self.get_header_columns_as_list() 8500 and self.get_header_sample_list() 8501 ): 8502 8503 # genotypeconcordance annotation field 8504 genotypeconcordance_tag = "genotypeconcordance" 8505 8506 # VCF infos tags 8507 vcf_infos_tags = { 8508 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8509 } 8510 8511 # Prefix 8512 prefix = self.get_explode_infos_prefix() 8513 8514 # Field 8515 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8516 8517 # Variants table 8518 table_variants = self.get_table_variants() 8519 8520 # Header 8521 vcf_reader = self.get_header() 8522 8523 # Create variant id 8524 variant_id_column = self.get_variant_id_column() 8525 added_columns = [variant_id_column] 8526 8527 # variant_id, FORMAT and samples 8528 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8529 self.get_header_sample_list() 8530 ) 8531 8532 # Create dataframe 8533 dataframe_genotypeconcordance = self.get_query_to_df( 8534 f""" SELECT 
{samples_fields} FROM {table_variants} """ 8535 ) 8536 8537 # Create genotypeconcordance column 8538 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8539 dataframe_genotypeconcordance.apply( 8540 lambda row: genotypeconcordance( 8541 row, samples=self.get_header_sample_list() 8542 ), 8543 axis=1, 8544 ) 8545 ) 8546 8547 # Add genotypeconcordance to header 8548 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8549 genotypeconcordance_tag, 8550 ".", 8551 "String", 8552 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8553 "howard calculation", 8554 "0", 8555 self.code_type_map.get("String"), 8556 ) 8557 8558 # Update 8559 sql_update = f""" 8560 UPDATE variants 8561 SET "INFO" = 8562 concat( 8563 CASE 8564 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8565 THEN '' 8566 ELSE concat("INFO", ';') 8567 END, 8568 CASE 8569 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8570 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8571 THEN concat( 8572 '{genotypeconcordance_tag}=', 8573 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8574 ) 8575 ELSE '' 8576 END 8577 ) 8578 FROM dataframe_genotypeconcordance 8579 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8580 """ 8581 self.conn.execute(sql_update) 8582 8583 # Remove added columns 8584 for added_column in added_columns: 8585 self.drop_column(column=added_column) 8586 8587 # Delete dataframe 8588 del dataframe_genotypeconcordance 8589 gc.collect() 8590 8591 def calculation_barcode(self, tag: str = "barcode") -> None: 8592 """ 8593 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8594 updates the INFO field in the file with the calculated barcode values. 8595 8596 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8597 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8598 the default tag name is set to "barcode", defaults to barcode 8599 :type tag: str (optional) 8600 """ 8601 8602 # if FORMAT and samples 8603 if ( 8604 "FORMAT" in self.get_header_columns_as_list() 8605 and self.get_header_sample_list() 8606 ): 8607 8608 # barcode annotation field 8609 if not tag: 8610 tag = "barcode" 8611 8612 # VCF infos tags 8613 vcf_infos_tags = { 8614 tag: "barcode calculation (VaRank)", 8615 } 8616 8617 # Prefix 8618 prefix = self.get_explode_infos_prefix() 8619 8620 # Field 8621 barcode_infos = prefix + tag 8622 8623 # Variants table 8624 table_variants = self.get_table_variants() 8625 8626 # Header 8627 vcf_reader = self.get_header() 8628 8629 # Create variant id 8630 variant_id_column = self.get_variant_id_column() 8631 added_columns = [variant_id_column] 8632 8633 # variant_id, FORMAT and samples 8634 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8635 self.get_header_sample_list() 8636 ) 8637 8638 # Create dataframe 8639 dataframe_barcode = self.get_query_to_df( 8640 f""" SELECT {samples_fields} FROM {table_variants} """ 8641 ) 8642 8643 # Create barcode column 8644 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8645 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8646 ) 8647 8648 # Add barcode to header 8649 vcf_reader.infos[tag] = vcf.parser._Info( 8650 tag, 8651 ".", 8652 "String", 8653 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8654 "howard calculation", 8655 "0", 8656 self.code_type_map.get("String"), 8657 ) 8658 8659 # Update 8660 sql_update = f""" 8661 UPDATE {table_variants} 8662 SET "INFO" = 8663 concat( 8664 CASE 8665 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8666 THEN '' 8667 ELSE concat("INFO", ';') 8668 END, 8669 CASE 8670 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8671 AND dataframe_barcode."{barcode_infos}" NOT NULL 8672 THEN concat( 8673 '{tag}=', 8674 dataframe_barcode."{barcode_infos}" 8675 ) 8676 ELSE '' 8677 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the FORMAT/sample columns in the file with the calculated barcode values.

        The pedigree may be given as a JSON file path, a JSON string, a
        comma-separated list of sample names, or a dict; when absent, all
        samples of the VCF are used. Skipped when no FORMAT column or samples.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is neither str nor dict, or resolves
            to an empty sample set
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as a comma-separated sample list
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field (column name holding the computed value in the dataframe)
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary join key, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            # NOTE: the local variable name 'dataframe_barcode' is load-bearing —
            # the UPDATE below references it via duckdb's replacement scan.
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (computed only over the pedigree samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields (value and samples list) to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<value>:<samples>' to every sample column and
            # ':<tag>:<tag>S' to FORMAT; non-pedigree samples get ':.:.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Used to turn FORMAT into a './.'-style placeholder genotype:
                # strip FORMAT key characters, then pad each remaining ':' with '.'
                # NOTE(review): assumes FORMAT keys only contain [a-zA-Z0-9\s] —
                # confirm for exotic FORMAT keys
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
vcf.parser._Format( 8820 id=tag, 8821 num=".", 8822 type="String", 8823 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8824 type_code=self.code_type_map.get("String"), 8825 ) 8826 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8827 id=f"{tag}S", 8828 num=".", 8829 type="String", 8830 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8831 type_code=self.code_type_map.get("String"), 8832 ) 8833 8834 # Update 8835 # for sample in ped_samples: 8836 sql_update_set = [] 8837 for sample in self.get_header_sample_list() + ["FORMAT"]: 8838 if sample in ped_samples: 8839 value = f'dataframe_barcode."{barcode_infos}"' 8840 value_samples = "'" + ",".join(ped_samples) + "'" 8841 elif sample == "FORMAT": 8842 value = f"'{tag}'" 8843 value_samples = f"'{tag}S'" 8844 else: 8845 value = "'.'" 8846 value_samples = "'.'" 8847 format_regex = r"[a-zA-Z0-9\s]" 8848 sql_update_set.append( 8849 f""" 8850 "{sample}" = 8851 concat( 8852 CASE 8853 WHEN {table_variants}."{sample}" = './.' 8854 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8855 ELSE {table_variants}."{sample}" 8856 END, 8857 ':', 8858 {value}, 8859 ':', 8860 {value_samples} 8861 ) 8862 """ 8863 ) 8864 8865 sql_update_set_join = ", ".join(sql_update_set) 8866 sql_update = f""" 8867 UPDATE {table_variants} 8868 SET {sql_update_set_join} 8869 FROM dataframe_barcode 8870 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8871 """ 8872 self.conn.execute(sql_update) 8873 8874 # Remove added columns 8875 for added_column in added_columns: 8876 self.drop_column(column=added_column) 8877 8878 # Delete dataframe 8879 del dataframe_barcode 8880 gc.collect() 8881 8882 def calculation_trio(self) -> None: 8883 """ 8884 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8885 information to the INFO field of each variant. 
8886 """ 8887 8888 # if FORMAT and samples 8889 if ( 8890 "FORMAT" in self.get_header_columns_as_list() 8891 and self.get_header_sample_list() 8892 ): 8893 8894 # trio annotation field 8895 trio_tag = "trio" 8896 8897 # VCF infos tags 8898 vcf_infos_tags = { 8899 "trio": "trio calculation", 8900 } 8901 8902 # Param 8903 param = self.get_param() 8904 8905 # Prefix 8906 prefix = self.get_explode_infos_prefix() 8907 8908 # Trio param 8909 trio_ped = ( 8910 param.get("calculation", {}) 8911 .get("calculations", {}) 8912 .get("TRIO", {}) 8913 .get("trio_pedigree", None) 8914 ) 8915 8916 # Load trio 8917 if trio_ped: 8918 8919 # Trio pedigree is a file 8920 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8921 log.debug("TRIO pedigree is file") 8922 with open(full_path(trio_ped)) as trio_ped: 8923 trio_ped = json.load(trio_ped) 8924 8925 # Trio pedigree is a string 8926 elif isinstance(trio_ped, str): 8927 log.debug("TRIO pedigree is str") 8928 try: 8929 trio_ped = json.loads(trio_ped) 8930 log.debug("TRIO pedigree is json str") 8931 except ValueError as e: 8932 trio_samples = trio_ped.split(",") 8933 if len(trio_samples) == 3: 8934 trio_ped = { 8935 "father": trio_samples[0], 8936 "mother": trio_samples[1], 8937 "child": trio_samples[2], 8938 } 8939 log.debug("TRIO pedigree is list str") 8940 else: 8941 msg_error = "TRIO pedigree not well formatted" 8942 log.error(msg_error) 8943 raise ValueError(msg_error) 8944 8945 # Trio pedigree is a dict 8946 elif isinstance(trio_ped, dict): 8947 log.debug("TRIO pedigree is dict") 8948 8949 # Trio pedigree is not well formatted 8950 else: 8951 msg_error = "TRIO pedigree not well formatted" 8952 log.error(msg_error) 8953 raise ValueError(msg_error) 8954 8955 # Construct trio list 8956 trio_samples = [ 8957 trio_ped.get("father", ""), 8958 trio_ped.get("mother", ""), 8959 trio_ped.get("child", ""), 8960 ] 8961 8962 else: 8963 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 8964 samples_list = self.get_header_sample_list() 8965 if len(samples_list) >= 3: 8966 trio_samples = self.get_header_sample_list()[0:3] 8967 trio_ped = { 8968 "father": trio_samples[0], 8969 "mother": trio_samples[1], 8970 "child": trio_samples[2], 8971 } 8972 else: 8973 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8974 log.error(msg_error) 8975 raise ValueError(msg_error) 8976 8977 # Check trio pedigree 8978 if not trio_ped or len(trio_ped) != 3: 8979 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8980 log.error(msg_error) 8981 raise ValueError(msg_error) 8982 8983 # Log 8984 log.info( 8985 f"Calculation 'TRIO' - Samples: " 8986 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 8987 ) 8988 8989 # Field 8990 trio_infos = prefix + trio_tag 8991 8992 # Variants table 8993 table_variants = self.get_table_variants() 8994 8995 # Header 8996 vcf_reader = self.get_header() 8997 8998 # Create variant id 8999 variant_id_column = self.get_variant_id_column() 9000 added_columns = [variant_id_column] 9001 9002 # variant_id, FORMAT and samples 9003 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9004 self.get_header_sample_list() 9005 ) 9006 9007 # Create dataframe 9008 dataframe_trio = self.get_query_to_df( 9009 f""" SELECT {samples_fields} FROM {table_variants} """ 9010 ) 9011 9012 # Create trio column 9013 dataframe_trio[trio_infos] = dataframe_trio.apply( 9014 lambda row: trio(row, samples=trio_samples), axis=1 9015 ) 9016 9017 # Add trio to header 9018 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9019 trio_tag, 9020 ".", 9021 "String", 9022 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9023 "howard calculation", 9024 "0", 9025 self.code_type_map.get("String"), 9026 ) 9027 9028 # Update 9029 sql_update = f""" 9030 UPDATE {table_variants} 9031 SET "INFO" = 9032 concat( 9033 CASE 9034 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9035 THEN '' 9036 ELSE 
concat("INFO", ';') 9037 END, 9038 CASE 9039 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9040 AND dataframe_trio."{trio_infos}" NOT NULL 9041 THEN concat( 9042 '{trio_tag}=', 9043 dataframe_trio."{trio_infos}" 9044 ) 9045 ELSE '' 9046 END 9047 ) 9048 FROM dataframe_trio 9049 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9050 """ 9051 self.conn.execute(sql_update) 9052 9053 # Remove added columns 9054 for added_column in added_columns: 9055 self.drop_column(column=added_column) 9056 9057 # Delete dataframe 9058 del dataframe_trio 9059 gc.collect() 9060 9061 def calculation_vaf_normalization(self) -> None: 9062 """ 9063 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9064 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9065 :return: The function does not return anything. 9066 """ 9067 9068 # if FORMAT and samples 9069 if ( 9070 "FORMAT" in self.get_header_columns_as_list() 9071 and self.get_header_sample_list() 9072 ): 9073 9074 # vaf_normalization annotation field 9075 vaf_normalization_tag = "VAF" 9076 9077 # VCF infos tags 9078 vcf_infos_tags = { 9079 "VAF": "VAF Variant Frequency", 9080 } 9081 9082 # Prefix 9083 prefix = self.get_explode_infos_prefix() 9084 9085 # Variants table 9086 table_variants = self.get_table_variants() 9087 9088 # Header 9089 vcf_reader = self.get_header() 9090 9091 # Do not calculate if VAF already exists 9092 if "VAF" in vcf_reader.formats: 9093 log.debug("VAF already on genotypes") 9094 return 9095 9096 # Create variant id 9097 variant_id_column = self.get_variant_id_column() 9098 added_columns = [variant_id_column] 9099 9100 # variant_id, FORMAT and samples 9101 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9102 f""" "{sample}" """ for sample in self.get_header_sample_list() 9103 ) 9104 9105 # Create dataframe 9106 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Skipped entirely when the VCF has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one tag per statistic
            # NOTE(review): '_stats_mediane' is a misspelling of 'median' but it
            # is a runtime tag name — renaming it would change the emitted VCF
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (column name holding the stats dict in the dataframe)
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary join key, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE: the local variable name 'dataframe_vaf_stats' is load-bearing —
            # the UPDATE below references it via duckdb's replacement scan.
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: per-row dict of all statistics
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats: one dataframe column per statistic
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator for every field after the first
                # NOTE(review): the separator depends on field position, not on
                # whether the previous field produced a value — a NULL first
                # stat can yield a doubled ';' in INFO; confirm if this matters
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append all '<stat>=<value>' pairs to INFO
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
lambda row: genotype_stats( 9222 row, samples=self.get_header_sample_list(), info=info 9223 ), 9224 axis=1, 9225 ) 9226 9227 # List of vcf tags 9228 sql_vaf_stats_fields = [] 9229 9230 # Check all VAF stats infos 9231 for stat in vcf_infos_tags: 9232 9233 # Extract stats 9234 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9235 lambda x: dict(x).get(stat, "") 9236 ) 9237 9238 # Add snpeff_hgvs to header 9239 vcf_reader.infos[stat] = vcf.parser._Info( 9240 stat, 9241 ".", 9242 "String", 9243 vcf_infos_tags.get(stat, "genotype statistics"), 9244 "howard calculation", 9245 "0", 9246 self.code_type_map.get("String"), 9247 ) 9248 9249 if len(sql_vaf_stats_fields): 9250 sep = ";" 9251 else: 9252 sep = "" 9253 9254 # Create fields to add in INFO 9255 sql_vaf_stats_fields.append( 9256 f""" 9257 CASE 9258 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9259 THEN concat( 9260 '{sep}{stat}=', 9261 dataframe_vaf_stats."{stat}" 9262 ) 9263 ELSE '' 9264 END 9265 """ 9266 ) 9267 9268 # SQL set for update 9269 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9270 9271 # Update 9272 sql_update = f""" 9273 UPDATE {table_variants} 9274 SET "INFO" = 9275 concat( 9276 CASE 9277 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9278 THEN '' 9279 ELSE concat("INFO", ';') 9280 END, 9281 {sql_vaf_stats_fields_set} 9282 ) 9283 FROM dataframe_vaf_stats 9284 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9285 9286 """ 9287 self.conn.execute(sql_update) 9288 9289 # Remove added columns 9290 for added_column in added_columns: 9291 self.drop_column(column=added_column) 9292 9293 # Delete dataframe 9294 del dataframe_vaf_stats 9295 gc.collect() 9296 9297 def calculation_transcripts_json(self, info: str = "transcripts_json") -> None: 9298 """ 9299 The function `calculation_transcripts_json` creates a transcripts table and adds an info field 9300 to it if transcripts are available. 
9301 9302 :param info: The `info` parameter in the `calculation_transcripts_json` method is a string 9303 parameter that specifies the information field to be used in the transcripts JSON. It has a 9304 default value of "transcripts_json" if no value is provided when calling the method, defaults to 9305 transcripts_json 9306 :type info: str (optional) 9307 """ 9308 9309 # Create transcripts table 9310 transcripts_table = self.create_transcript_view() 9311 9312 # Add info field 9313 if transcripts_table: 9314 self.transcript_view_to_variants( 9315 transcripts_table=transcripts_table, transcripts_info_field=info 9316 ) 9317 else: 9318 log.info("No Transcripts to process. Check param.json file configuration") 9319 9320 def calculation_transcripts_prioritization(self) -> None: 9321 """ 9322 The function `calculation_transcripts_prioritization` creates a transcripts table and 9323 prioritizes transcripts based on certain criteria. 9324 """ 9325 9326 # Create transcripts table 9327 transcripts_table = self.create_transcript_view() 9328 9329 # Add info field 9330 if transcripts_table: 9331 self.transcripts_prioritization(transcripts_table=transcripts_table) 9332 else: 9333 log.info("No Transcripts to process. Check param.json file configuration") 9334 9335 ############### 9336 # Transcripts # 9337 ############### 9338 9339 def transcripts_prioritization( 9340 self, transcripts_table: str = None, param: dict = {} 9341 ) -> bool: 9342 """ 9343 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9344 and updates the variants table with the prioritized information. 9345 9346 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9347 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 
9348 This parameter is used to identify the table where the transcripts data is stored for the 9349 prioritization process 9350 :type transcripts_table: str 9351 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9352 that contains various configuration settings for the prioritization process of transcripts. It 9353 is used to customize the behavior of the prioritization algorithm and includes settings such as 9354 the prefix for prioritization fields, default profiles, and other 9355 :type param: dict 9356 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9357 transcripts prioritization process is successfully completed, and `False` if there are any 9358 issues or if no profile is defined for transcripts prioritization. 9359 """ 9360 9361 log.debug("Start transcripts prioritization...") 9362 9363 # Param 9364 if not param: 9365 param = self.get_param() 9366 9367 # Variants table 9368 table_variants = self.get_table_variants() 9369 log.debug(f"transcripts_table={transcripts_table}") 9370 # Transcripts table 9371 if transcripts_table is None: 9372 log.debug(f"transcripts_table={transcripts_table}") 9373 transcripts_table = self.create_transcript_view( 9374 transcripts_table="transcripts", param=param 9375 ) 9376 log.debug(f"transcripts_table={transcripts_table}") 9377 if transcripts_table is None: 9378 msg_err = "No Transcripts table availalble" 9379 log.error(msg_err) 9380 raise ValueError(msg_err) 9381 9382 # Get transcripts columns 9383 columns_as_list_query = f""" 9384 DESCRIBE {transcripts_table} 9385 """ 9386 columns_as_list = list( 9387 self.get_query_to_df(columns_as_list_query)["column_name"] 9388 ) 9389 9390 # Create INFO if not exists 9391 if "INFO" not in columns_as_list: 9392 query_add_info = f""" 9393 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9394 """ 9395 self.execute_query(query_add_info) 9396 9397 # Prioritization param and Force only PZ Score 
and Flag 9398 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9399 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9400 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9401 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9402 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9403 pz_profile_default = ( 9404 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9405 ) 9406 9407 # Exit if no profile 9408 if pz_profile_default is None: 9409 log.warning("No profile defined for transcripts prioritization") 9410 return False 9411 9412 # Prioritization 9413 prioritization_result = self.prioritization( 9414 table=transcripts_table, 9415 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9416 ) 9417 if not prioritization_result: 9418 log.warning("Transcripts prioritization not processed") 9419 return False 9420 9421 # Explode PZ fields 9422 self.explode_infos( 9423 table=transcripts_table, 9424 fields=param.get("transcripts", {}) 9425 .get("prioritization", {}) 9426 .get("pzfields", []), 9427 ) 9428 9429 # Export Transcripts prioritization infos to variants table 9430 query_update = f""" 9431 WITH RankedTranscripts AS ( 9432 SELECT 9433 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9434 ROW_NUMBER() OVER ( 9435 PARTITION BY "#CHROM", POS, REF, ALT 9436 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9437 ) AS rn 9438 FROM 9439 {transcripts_table} 9440 ) 9441 UPDATE {table_variants} 9442 SET 9443 INFO = CONCAT(CASE 9444 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9445 THEN '' 9446 ELSE concat("INFO", ';') 9447 END, 9448 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9449 ) 9450 FROM 9451 RankedTranscripts 9452 WHERE 9453 rn = 1 9454 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9455 AND variants."POS" = RankedTranscripts."POS" 9456 AND 
def create_transcript_view_from_columns_map(
    self,
    transcripts_table: str = "transcripts",
    columns_maps: dict = None,
    added_columns: list = None,
    temporary_tables: list = None,
    annotation_fields: list = None,
) -> tuple[list, list, list]:
    """
    Create one temporary transcripts table per columns mapping.

    Each entry of `columns_maps` names a main transcripts column (a
    comma-separated list of transcript IDs) and companion info columns
    (comma-separated lists aligned with it). The columns are exploded from
    INFO, split into one row per transcript, and materialized in a
    uniquely-named temporary table.

    :param transcripts_table: prefix used to build the temporary table names,
        defaults to "transcripts"
    :type transcripts_table: str (optional)
    :param columns_maps: list of mapping dicts, each with keys
        "transcripts_column" and "transcripts_infos_columns"
    :type columns_maps: dict
    :param added_columns: accumulator of columns added to the variants table
        by `explode_infos` (so the caller can drop them afterwards)
    :type added_columns: list
    :param temporary_tables: accumulator of created temporary table names
    :type temporary_tables: list
    :param annotation_fields: accumulator of annotation field names found
    :type annotation_fields: list
    :return: tuple (added_columns, temporary_tables, annotation_fields)
    """

    log.debug("Start transcripts view creation from columns map...")

    # Example of expected "from_columns_map" structure:
    # "from_columns_map": [
    #     {
    #         "transcripts_column": "Ensembl_transcriptid",
    #         "transcripts_infos_columns": [
    #             "genename",
    #             "Ensembl_geneid",
    #             "LIST_S2_score",
    #             "LIST_S2_pred",
    #         ],
    #     },
    # ],

    # Init (None defaults avoid the shared-mutable-default-argument pitfall)
    if columns_maps is None:
        columns_maps = []
    if added_columns is None:
        added_columns = []
    if temporary_tables is None:
        temporary_tables = []
    if annotation_fields is None:
        annotation_fields = []

    # Variants table
    table_variants = self.get_table_variants()

    for columns_map in columns_maps:

        # Transcript column
        transcripts_column = columns_map.get("transcripts_column", None)

        # Transcripts infos columns
        transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

        if transcripts_column is not None:

            # Explode INFO fields into dedicated columns
            added_columns += self.explode_infos(
                fields=[transcripts_column] + transcripts_infos_columns
            )

            # View clauses
            clause_select = []
            for field in [transcripts_column] + transcripts_infos_columns:
                clause_select.append(
                    f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                )
                if field not in [transcripts_column]:
                    annotation_fields.append(field)

            # Query view
            # NOTE(review): clause_select is applied in both the inner and the
            # outer SELECT, so each field is split twice — presumably intended
            # for nested comma-separated values; confirm before changing.
            query = f"""
                SELECT
                    "#CHROM", POS, REF, ALT,
                    "{transcripts_column}" AS 'transcript',
                    {", ".join(clause_select)}
                FROM (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        {", ".join(clause_select)}
                    FROM {table_variants}
                )
                WHERE "{transcripts_column}" IS NOT NULL
            """

            # Temporary table name with a unique random suffix
            temporary_table = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Record and create the temporary table
            temporary_tables.append(temporary_table)
            query_view = f"""
                CREATE TEMPORARY TABLE {temporary_table}
                AS ({query})
            """
            self.execute_query(query=query_view)

    return added_columns, temporary_tables, annotation_fields
def create_transcript_view_from_column_format(
    self,
    transcripts_table: str = "transcripts",
    column_formats: dict = None,
    temporary_tables: list = None,
    annotation_fields: list = None,
) -> tuple[list, list]:
    """
    Create temporary transcripts views from annotation columns in "column
    format" (e.g. snpEff "ANN"), one per entry of `column_formats`.

    :param transcripts_table: prefix used to build the temporary view names,
        defaults to "transcripts"
    :type transcripts_table: str (optional)
    :param column_formats: list of dicts, each with keys "transcripts_column"
        (annotation field, default "ANN") and "transcripts_infos_column"
        (transcript ID sub-field, default "Feature_ID")
    :type column_formats: dict
    :param temporary_tables: accumulator of created temporary view names
    :type temporary_tables: list
    :param annotation_fields: accumulator of annotation field names extracted
        from the created views
    :type annotation_fields: list
    :return: tuple (temporary_tables, annotation_fields)
    """

    log.debug("Start transcripts view creation from column format...")

    # Example of expected "from_column_format" structure:
    # "from_column_format": [
    #     {
    #         "transcripts_column": "ANN",
    #         "transcripts_infos_column": "Feature_ID",
    #     }
    # ],

    # Init (None defaults avoid the shared-mutable-default-argument pitfall)
    if column_formats is None:
        column_formats = []
    if temporary_tables is None:
        temporary_tables = []
    if annotation_fields is None:
        annotation_fields = []

    for column_format in column_formats:

        # Annotation field and transcript annotation field
        annotation_field = column_format.get("transcripts_column", "ANN")
        transcript_annotation = column_format.get(
            "transcripts_infos_column", "Feature_ID"
        )

        # Temporary view name with a unique random suffix
        temporary_view_name = transcripts_table + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )

        # Create the temporary view (returns None when the annotation field
        # is not present in the VCF header)
        temporary_view_name = self.annotation_format_to_table(
            uniquify=True,
            annotation_field=annotation_field,
            view_name=temporary_view_name,
            annotation_id=transcript_annotation,
        )

        # Annotation fields
        if temporary_view_name:
            query_annotation_fields = f"""
                SELECT *
                FROM (
                    DESCRIBE SELECT *
                    FROM {temporary_view_name}
                )
                WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
            """
            df_annotation_fields = self.get_query_to_df(
                query=query_annotation_fields
            )

            # Add temporary view and its annotation fields
            temporary_tables.append(temporary_view_name)
            annotation_fields += list(set(df_annotation_fields["column_name"]))

    return temporary_tables, annotation_fields
def create_transcript_view(
    self,
    transcripts_table: str = None,
    transcripts_table_drop: bool = True,
    param: dict = None,
) -> str:
    """
    Create the transcripts table by merging per-source temporary views.

    The "struct" section of the transcripts parameters describes how to build
    transcripts rows, either from comma-separated columns maps
    ("from_columns_map") or from column-format annotations
    ("from_column_format"). The resulting temporary tables are merged
    (UNION BY NAME) and aggregated per variant/transcript into the final table.

    :param transcripts_table: name of the table to create; when None, taken
        from the "transcripts.table" parameter (default "transcripts")
    :type transcripts_table: str (optional)
    :param transcripts_table_drop: drop an existing transcripts table before
        creating the new one, defaults to True
    :type transcripts_table_drop: bool (optional)
    :param param: parameters dict; when empty/None, `get_param()` is used
    :type param: dict
    :return: the name of the created transcripts table, or None when no
        "struct" section is defined
    """

    log.debug("Start transcripts view creation...")

    # Default
    transcripts_table_default = "transcripts"

    # Param
    if not param:
        param = self.get_param()

    # Struct
    struct = param.get("transcripts", {}).get("struct", None)

    if struct:

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Accumulators, mutated in place and returned by the helpers below
        added_columns = []
        temporary_tables = []
        annotation_fields = []

        # From columns map.
        # The helpers mutate the lists passed in and return those same list
        # objects; re-extending with the returned value (as the previous code
        # did) duplicated every entry, so plain re-assignment is used instead.
        columns_maps = struct.get("from_columns_map", [])
        added_columns, temporary_tables, annotation_fields = (
            self.create_transcript_view_from_columns_map(
                transcripts_table=transcripts_table,
                columns_maps=columns_maps,
                added_columns=added_columns,
                temporary_tables=temporary_tables,
                annotation_fields=annotation_fields,
            )
        )

        # From column format (same in-place/return contract as above)
        column_formats = struct.get("from_column_format", [])
        temporary_tables, annotation_fields = (
            self.create_transcript_view_from_column_format(
                transcripts_table=transcripts_table,
                column_formats=column_formats,
                temporary_tables=temporary_tables,
                annotation_fields=annotation_fields,
            )
        )

        # Merge temporary tables query
        query_merge = ""
        for temporary_table in temporary_tables:

            # First temporary table
            if not query_merge:
                query_merge = f"""
                    SELECT * FROM {temporary_table}
                """
            # Other temporary tables (using UNION)
            else:
                query_merge += f"""
                    UNION BY NAME SELECT * FROM {temporary_table}
                """

        # Merge on transcript
        query_merge_on_transcripts_annotation_fields = []
        # Aggregate all annotations fields (distinct values joined by comma)
        for annotation_field in set(annotation_fields):
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
            )
        # Query for transcripts view
        query_merge_on_transcripts = f"""
            SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
            FROM ({query_merge})
            GROUP BY "#CHROM", POS, REF, ALT, transcript
        """

        # Drop transcripts table if necessary
        if transcripts_table_drop:
            query_drop = f"""
                DROP TABLE IF EXISTS {transcripts_table};
            """
            self.execute_query(query=query_drop)

        # Merge and create transcripts table
        query_create_view = f"""
            CREATE TABLE IF NOT EXISTS {transcripts_table}
            AS {query_merge_on_transcripts}
        """
        self.execute_query(query=query_create_view)

        # Remove columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

    else:

        transcripts_table = None

    return transcripts_table
def annotation_format_to_table(
    self,
    uniquify: bool = True,
    annotation_field: str = "ANN",
    annotation_id: str = "Feature_ID",
    view_name: str = "transcripts",
) -> str:
    """
    Convert a column-format annotation field (e.g. snpEff "ANN") into a
    structured temporary table, one row per annotation entry.

    The field's sub-field names are parsed from the quoted part of its VCF
    header description, each annotation is exploded to JSON, and a temporary
    table is created with one typed column per JSON key plus a `transcript`
    column taken from `annotation_id`.

    :param uniquify: whether to keep only unique values when exploding the
        annotation, defaults to True
    :type uniquify: bool (optional)
    :param annotation_field: INFO field holding the annotations, defaults
        to ANN
    :type annotation_field: str (optional)
    :param annotation_id: sub-field used as the transcript identifier
        (non-alphanumeric characters are stripped), defaults to Feature_ID
    :type annotation_id: str (optional)
    :param view_name: name of the temporary table to create, defaults to
        transcripts
    :type view_name: str (optional)
    :return: the created table name, or None when `annotation_field` is not
        declared in the VCF header
    """

    # Name of the intermediate JSON column holding the exploded annotation
    annotation_format = "annotation_explode"

    # Transcript annotation: keep alphanumeric characters only (used verbatim
    # in SQL below)
    annotation_id = "".join(char for char in annotation_id if char.isalnum())

    # Prefix
    # NOTE(review): any truthy configured prefix is replaced by the literal
    # "INFO/" — looks intentional (columns are exploded with that prefix)
    # but confirm against get_explode_infos_prefix semantics.
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # Fully-qualified column names for the annotation and its exploded form
    annotation_infos = prefix + annotation_field
    annotation_format_infos = prefix + annotation_format

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added along the way, dropped again before returning
    added_columns = []

    # Explode the annotation field into a dedicated column
    added_columns += self.explode_infos(fields=[annotation_field])

    if annotation_field in vcf_reader.infos:

        # Extract the sub-field names from the quoted part of the header
        # description (e.g. "... 'Allele | Annotation | ... | Feature_ID'")
        ann_description = vcf_reader.infos[annotation_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            # NOTE(review): ann_header is filled but never read afterwards;
            # only ann_header_desc (clean name -> original name) is used.
            ann_header = []
            ann_header_desc = {}
            for i in range(len(ann_header_match)):
                ann_header_info = "".join(
                    char for char in ann_header_match[i] if char.isalnum()
                )
                ann_header.append(ann_header_info)
                ann_header_desc[ann_header_info] = ann_header_match[i]
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id column (added, so dropped on exit)
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Load variants and their annotation column into a dataframe
        dataframe_annotation_format = self.get_query_to_df(
            f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
        )

        # Explode each annotation value into a JSON document
        dataframe_annotation_format[
            annotation_format_infos
        ] = dataframe_annotation_format[annotation_infos].apply(
            lambda x: explode_annotation_format(
                annotation=str(x),
                uniquify=uniquify,
                output_format="JSON",
                prefix="",
                header=list(ann_header_desc.values()),
            )
        )

        # Find the JSON keys present in the first annotation entry
        query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
        df_keys = self.get_query_to_df(query=query_json)

        # Build one typed SELECT clause per JSON key
        query_json_key = []
        for _, row in df_keys.iterrows():

            # Key
            key = row.iloc[0]

            # key_clean: alphanumeric-only version used as the column name
            key_clean = "".join(char for char in key if char.isalnum())

            # Sample the key's values to detect its column type
            query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

            # Get DataFrame from query
            df_json_type = self.get_query_to_df(query=query_json_type)

            # Fill missing values with empty strings, then replace empty
            # strings or None with NaN and drop those rows, so type detection
            # only sees real values
            with pd.option_context("future.no_silent_downcasting", True):
                df_json_type.fillna(value="", inplace=True)
                replace_dict = {None: np.nan, "": np.nan}
                df_json_type.replace(replace_dict, inplace=True)
                df_json_type.dropna(inplace=True)

            # Detect column type
            column_type = detect_column_type(df_json_type[key_clean])

            # Append a typed extraction clause (empty strings become NULL)
            query_json_key.append(
                f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
            )

        # Create the temporary table, adding the transcript identifier column
        query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
        self.execute_query(query=query_view)

    else:

        # Annotation field not declared in the header: nothing to build
        view_name = None

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    return view_name
def transcript_view_to_variants(
    self,
    transcripts_table: str = None,
    transcripts_column_id: str = None,
    transcripts_info_json: str = None,
    transcripts_info_field: str = None,
    param: dict = None,
) -> bool:
    """
    Aggregate the transcripts table into a per-variant JSON document and
    write it back to the variants table, as a JSON column and/or an INFO
    field.

    :param transcripts_table: transcripts table name; when None, taken from
        the "transcripts.table" parameter (default "transcripts")
    :type transcripts_table: str
    :param transcripts_column_id: column holding the transcript identifier;
        when None, taken from "transcripts.column_id" (default "transcript")
    :type transcripts_column_id: str
    :param transcripts_info_json: name of the JSON column to create on the
        variants table; when None, taken from the parameters
    :type transcripts_info_json: str
    :param transcripts_info_field: name of the INFO field receiving the JSON;
        when None, taken from the parameters
    :type transcripts_info_field: str
    :param param: parameters dict; when empty/None, `get_param()` is used
    :type param: dict
    :return: True on success, False when neither a JSON column nor an INFO
        field is configured
    """

    log.debug("Start transcripts view to JSON...")

    # Defaults
    transcripts_table_default = "transcripts"
    transcripts_column_id_default = "transcript"
    transcripts_info_json_default = None
    transcripts_info_field_default = None

    # Param (None default avoids the shared-mutable-default-argument pitfall)
    if not param:
        param = self.get_param()

    # Transcripts table
    if transcripts_table is None:
        transcripts_table = param.get("transcripts", {}).get(
            "table", transcripts_table_default
        )

    # Transcripts column ID
    if transcripts_column_id is None:
        transcripts_column_id = param.get("transcripts", {}).get(
            "column_id", transcripts_column_id_default
        )

    # Transcripts info JSON column
    if transcripts_info_json is None:
        transcripts_info_json = param.get("transcripts", {}).get(
            "transcripts_info_json", transcripts_info_json_default
        )

    # Transcripts info field
    if transcripts_info_field is None:
        transcripts_info_field = param.get("transcripts", {}).get(
            "transcripts_info_field", transcripts_info_field_default
        )

    # Variants table
    table_variants = self.get_table_variants()

    # Nothing to export when neither output is configured
    if transcripts_info_json is None and transcripts_info_field is None:
        return False

    # Transcripts infos columns (everything except the variant key and the
    # transcript identifier)
    query_transcripts_infos_columns = f"""
        SELECT *
        FROM (
            DESCRIBE SELECT * FROM {transcripts_table}
        )
        WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
    """
    transcripts_infos_columns = list(
        self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
    )

    # View results: split multi-valued columns and map each to a JSON entry
    clause_select = []
    clause_to_json = []
    for field in transcripts_infos_columns:
        clause_select.append(
            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
        )
        clause_to_json.append(f""" '{field}': "{field}" """)

    # SET clauses of the final UPDATE
    update_set = []

    # VCF header
    vcf_reader = self.get_header()

    # Transcripts to info column in JSON
    if transcripts_info_json is not None:

        # Create column on variants table
        self.add_column(
            table_name=table_variants,
            column_name=transcripts_info_json,
            column_type="JSON",
            default_value=None,
            drop=False,
        )

        # Add to update
        update_set.append(
            f""" {transcripts_info_json}=t.{transcripts_info_json} """
        )

        # Add header (source/version were misspelled "unknwon"; sibling
        # methods use "unknown")
        vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
            transcripts_info_json,
            ".",
            "String",
            "Transcripts in JSON format",
            "unknown",
            "unknown",
            self.code_type_map["String"],
        )

    # Transcripts to info field in JSON
    if transcripts_info_field is not None:

        # Add to update
        # NOTE(review): the concat always prefixes ';{field}=', so INFO gains
        # a leading ';' when it was empty — confirm downstream tolerance.
        update_set.append(
            f"""
            INFO = concat(
                CASE
                    WHEN INFO NOT IN ('', '.')
                    THEN INFO
                    ELSE ''
                END,
                CASE
                    WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                    THEN concat(
                        ';{transcripts_info_field}=',
                        t.{transcripts_info_json}
                    )
                    ELSE ''
                END
            )
            """
        )

        # Add header
        vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
            transcripts_info_field,
            ".",
            "String",
            "Transcripts in JSON format",
            "unknown",
            "unknown",
            self.code_type_map["String"],
        )

    # Update query: build one JSON object per variant, keyed by transcript id
    # NOTE(review): when transcripts_info_json is None (info-field-only path),
    # this interpolates "AS None" / "t.None" into the SQL and relies on the
    # engine treating None as a plain identifier — verify before relying on
    # that configuration.
    query_update = f"""
        UPDATE {table_variants}
        SET {", ".join(update_set)}
        FROM
        (
            SELECT
                "#CHROM", POS, REF, ALT,
                concat(
                    '{{',
                    string_agg(
                        '"' || "{transcripts_column_id}" || '":' ||
                        to_json(json_output)
                    ),
                    '}}'
                )::JSON AS {transcripts_info_json}
            FROM
            (
                SELECT
                    "#CHROM", POS, REF, ALT,
                    "{transcripts_column_id}",
                    to_json(
                        {{{",".join(clause_to_json)}}}
                    )::JSON AS json_output
                FROM
                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                WHERE "{transcripts_column_id}" IS NOT NULL
            )
            GROUP BY "#CHROM", POS, REF, ALT
        ) AS t
        WHERE {table_variants}."#CHROM" = t."#CHROM"
          AND {table_variants}."POS" = t."POS"
          AND {table_variants}."REF" = t."REF"
          AND {table_variants}."ALT" = t."ALT"
    """

    self.execute_query(query=query_update)

    return True
36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data()
The function __init__ initializes the variables, and sets the input, output, config, param, connection and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The
`set_input` method sets attributes related to the input file: it stores the file path and derives the file's base name, extension, and format.
107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 111 112 :param config: The `config` parameter in the `set_config` function is a dictionary object that 113 contains configuration settings for the class. When you call the `set_config` function with a 114 dictionary object as the argument, it will set that dictionary as the configuration object for 115 the class 116 :type config: dict 117 """ 118 119 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The
`config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call `set_config` with a dictionary object as the argument, it sets that dictionary as the configuration object for the class.
121 def set_param(self, param: dict) -> None: 122 """ 123 This function sets a parameter object for the class based on the input dictionary. 124 125 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 126 as the `param` attribute of the class instance 127 :type param: dict 128 """ 129 130 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The
`set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
132 def init_variables(self) -> None: 133 """ 134 This function initializes the variables that will be used in the rest of the class 135 """ 136 137 self.prefix = "howard" 138 self.table_variants = "variants" 139 self.dataframe = None 140 141 self.comparison_map = { 142 "gt": ">", 143 "gte": ">=", 144 "lt": "<", 145 "lte": "<=", 146 "equals": "=", 147 "contains": "SIMILAR TO", 148 } 149 150 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 151 152 self.code_type_map_to_sql = { 153 "Integer": "INTEGER", 154 "String": "VARCHAR", 155 "Float": "FLOAT", 156 "Flag": "VARCHAR", 157 } 158 159 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
161 def get_indexing(self) -> bool: 162 """ 163 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 164 returns False. 165 :return: The value of the indexing parameter. 166 """ 167 168 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
170 def get_connexion_config(self) -> dict: 171 """ 172 The function `get_connexion_config` returns a dictionary containing the configuration for a 173 connection, including the number of threads and memory limit. 174 :return: a dictionary containing the configuration for the Connexion library. 175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function
`get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 242 """ 243 244 # Default connexion db 245 default_connexion_db = ":memory:" 246 247 # Find connexion db 248 if self.get_input_format() in ["db", "duckdb"]: 249 connexion_db = self.get_input() 250 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 251 connexion_db = default_connexion_db 252 elif self.get_connexion_type() in ["tmpfile"]: 253 tmp_name = tempfile.mkdtemp( 254 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 255 ) 256 connexion_db = f"{tmp_name}/tmp.db" 257 elif self.get_connexion_type() != "": 258 connexion_db = self.get_connexion_type() 259 else: 260 connexion_db = default_connexion_db 261 262 # Set connexion db 263 self.connexion_db = connexion_db 264 265 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable
connexion_db.
    def set_connexion(self, conn) -> None:
        """
        Create and store the database connexion.

        :param conn: an existing database connexion to reuse; when None, a
            new connexion is opened (duckdb by default, or sqlite) using
            the connexion db string and the connexion configuration
        """

        # Connexion db (path or ":memory:"), computed and stored first
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory limit, temp dir, access mode)
        connexion_config = self.get_connexion_config()

        # Connexion format: "duckdb" (default) or "sqlite"
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Open a new connexion only when none was provided
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply optional DuckDB settings as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        # String values must be quoted inside the PRAGMA
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                # NOTE(review): the sqlite connexion ignores
                # connexion_config — confirm this is intended
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The
`conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
313 def set_output(self, output: str = None) -> None: 314 """ 315 The `set_output` function in Python sets the output file based on the input or a specified key 316 in the config file, extracting the output name, extension, and format. 317 318 :param output: The `output` parameter in the `set_output` method is used to specify the name of 319 the output file. If the config file has an 'output' key, the method sets the output to the value 320 of that key. If no output is provided, it sets the output to `None` 321 :type output: str 322 """ 323 324 if output and not isinstance(output, str): 325 self.output = output.name 326 else: 327 self.output = output 328 329 # Output format 330 if self.output: 331 output_name, output_extension = os.path.splitext(self.output) 332 self.output_name = output_name 333 self.output_extension = output_extension 334 self.output_format = self.output_extension.replace(".", "") 335 else: 336 self.output_name = None 337 self.output_extension = None 338 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The
`output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (``self.header_list``) and as a VCF object
        (``self.header_vcf``).

        The header is resolved in order: from a "header_file" given in the
        config, from the input VCF itself (compressed or not), from a
        sidecar ``.hdr`` file, or rebuilt from the file's columns; a
        minimal default VCF header is the last resort.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal fallback header
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:

                        # Best effort: fall back to the default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unsupported format: fail explicitly

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
442 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 443 """ 444 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 445 DataFrame based on the connection format. 446 447 :param query: The `query` parameter in the `get_query_to_df` function is a string that 448 represents the SQL query you want to execute. This query will be used to fetch data from a 449 database and convert it into a pandas DataFrame 450 :type query: str 451 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 452 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 453 function will only fetch up to that number of rows from the database query result. If no limit 454 is specified, 455 :type limit: int 456 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 457 """ 458 459 # Connexion format 460 connexion_format = self.get_connexion_format() 461 462 # Limit in query 463 if limit: 464 pd.set_option("display.max_rows", limit) 465 if connexion_format in ["duckdb"]: 466 df = ( 467 self.conn.execute(query) 468 .fetch_record_batch(limit) 469 .read_next_batch() 470 .to_pandas() 471 ) 472 elif connexion_format in ["sqlite"]: 473 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 474 475 # Full query 476 else: 477 if connexion_format in ["duckdb"]: 478 df = self.conn.execute(query).df() 479 elif connexion_format in ["sqlite"]: 480 df = pd.read_sql_query(query, self.conn) 481 482 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The
`query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: The `limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, all rows are returned.
Returns
A pandas DataFrame is being returned by the
`get_query_to_df` function.
484 def get_overview(self) -> None: 485 """ 486 The function prints the input, output, config, and dataframe of the current object 487 """ 488 table_variants_from = self.get_table_variants(clause="from") 489 sql_columns = self.get_header_columns_as_sql() 490 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 491 df = self.get_query_to_df(sql_query_export) 492 log.info( 493 "Input: " 494 + str(self.get_input()) 495 + " [" 496 + str(str(self.get_input_format())) 497 + "]" 498 ) 499 log.info( 500 "Output: " 501 + str(self.get_output()) 502 + " [" 503 + str(str(self.get_output_format())) 504 + "]" 505 ) 506 log.info("Config: ") 507 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 508 "\n" 509 ): 510 log.info("\t" + str(d)) 511 log.info("Param: ") 512 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 513 "\n" 514 ): 515 log.info("\t" + str(d)) 516 log.info("Sample list: " + str(self.get_header_sample_list())) 517 log.info("Dataframe: ") 518 for d in str(df).split("\n"): 519 log.info("\t" + str(d)) 520 521 # garbage collector 522 del df 523 gc.collect() 524 525 return None
The function prints the input, output, config, and dataframe of the current object
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, number of
        variants (by chromosome and by type), samples and genotypes,
        INFO/FORMAT header fields, quality metrics and SNV substitutions.

        :return: a dictionary of statistics with sections "Infos",
            "Variants", "Samples", "Header" and optionally "Quality"
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful for VCF-like data with genotypes
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; a sample column is considered
                # valid when it matches a genotype pattern and has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT
                        '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF "Number" codes (".", "A", "G", "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel clause below, AND binds tighter than
        # OR, so it reads len(REF)>1 OR (len(ALT)>1 AND len(REF)!=len(ALT))
        # — MNVs with len(REF)>1 are also counted as InDel; confirm intended
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (e.g. "A>G")
        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The
`file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report from the stats JSON and print the
        statistics to stdout.

        :param output_file: path of the markdown output file; a temporary
            "stats.md" is used when omitted
        :param json_file: path of the JSON file the stats are written to;
            a temporary "stats.json" is used when omitted
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default files live in the temporary directory
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Load stats back (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index (table of contents) and body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Render table-like values (dict or JSON string) as
                        # a markdown table, anything else as a bullet line
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The
`output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function
`print_stats` does not return any value. It has a return type annotation of `None`.
874 def get_input(self) -> str: 875 """ 876 It returns the value of the input variable. 877 :return: The input is being returned. 878 """ 879 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
881 def get_input_format(self, input_file: str = None) -> str: 882 """ 883 This function returns the format of the input variable, either from the provided input file or 884 by prompting for input. 885 886 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 887 represents the file path of the input file. If no `input_file` is provided when calling the 888 method, it will default to `None` 889 :type input_file: str 890 :return: The format of the input variable is being returned. 891 """ 892 893 if not input_file: 894 input_file = self.get_input() 895 input_format = get_file_format(input_file) 896 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The
`input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None`.
Returns
The format of the input variable is being returned.
898 def get_input_compressed(self, input_file: str = None) -> str: 899 """ 900 The function `get_input_compressed` returns the format of the input variable after compressing 901 it. 902 903 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 904 that represents the file path of the input file. If no `input_file` is provided when calling the 905 method, it will default to `None` and the method will then call `self.get_input()` to 906 :type input_file: str 907 :return: The function `get_input_compressed` returns the compressed format of the input 908 variable. 909 """ 910 911 if not input_file: 912 input_file = self.get_input() 913 input_compressed = get_file_compressed(input_file) 914 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
`input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()`.
Returns
The function
`get_input_compressed` returns the compressed status of the input file.
916 def get_output(self) -> str: 917 """ 918 It returns the output of the neuron. 919 :return: The output of the neural network. 920 """ 921 922 return self.output
It returns the output file path.
Returns
The output file path is being returned.
924 def get_output_format(self, output_file: str = None) -> str: 925 """ 926 The function `get_output_format` returns the format of the input variable or the output file if 927 provided. 928 929 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 930 that represents the file path of the output file. If no `output_file` is provided when calling 931 the method, it will default to the output obtained from the `get_output` method of the class 932 instance. The 933 :type output_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not output_file: 938 output_file = self.get_output() 939 output_format = get_file_format(output_file) 940 941 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
`output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
943 def get_config(self) -> dict: 944 """ 945 It returns the config 946 :return: The config variable is being returned. 947 """ 948 return self.config
It returns the config
Returns
The config variable is being returned.
950 def get_param(self) -> dict: 951 """ 952 It returns the param 953 :return: The param variable is being returned. 954 """ 955 return self.param
It returns the param
Returns
The param variable is being returned.
957 def get_connexion_db(self) -> str: 958 """ 959 It returns the connexion_db attribute of the object 960 :return: The connexion_db is being returned. 961 """ 962 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
964 def get_prefix(self) -> str: 965 """ 966 It returns the prefix of the object. 967 :return: The prefix is being returned. 968 """ 969 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
971 def get_table_variants(self, clause: str = "select") -> str: 972 """ 973 This function returns the table_variants attribute of the object 974 975 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 976 defaults to select (optional) 977 :return: The table_variants attribute of the object. 978 """ 979 980 # Access 981 access = self.get_config().get("access", None) 982 983 # Clauses "select", "where", "update" 984 if clause in ["select", "where", "update"]: 985 table_variants = self.table_variants 986 # Clause "from" 987 elif clause in ["from"]: 988 # For Read Only 989 if self.get_input_format() in ["parquet"] and access in ["RO"]: 990 input_file = self.get_input() 991 table_variants = f"'{input_file}' as variants" 992 # For Read Write 993 else: 994 table_variants = f"{self.table_variants} as variants" 995 else: 996 table_variants = self.table_variants 997 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used in. Either "select", "where", "update" or "from"; defaults to "select" (optional)
Returns
The table_variants attribute of the object.
999 def get_tmp_dir(self) -> str: 1000 """ 1001 The function `get_tmp_dir` returns the temporary directory path based on configuration 1002 parameters or a default path. 1003 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1004 configuration, parameters, and a default value of "/tmp". 1005 """ 1006 1007 return get_tmp( 1008 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1009 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
The `get_tmp_dir` method returns the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1011 def get_connexion_type(self) -> str: 1012 """ 1013 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1014 1015 :return: The connexion type is being returned. 1016 """ 1017 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1019 def get_connexion(self): 1020 """ 1021 It returns the connection object 1022 1023 :return: The connection object. 1024 """ 1025 return self.conn
It returns the connection object
Returns
The connection object.
1027 def close_connexion(self) -> None: 1028 """ 1029 This function closes the connection to the database. 1030 :return: The connection is being closed. 1031 """ 1032 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1034 def get_header(self, type: str = "vcf"): 1035 """ 1036 This function returns the header of the VCF file as a list of strings 1037 1038 :param type: the type of header you want to get, defaults to vcf (optional) 1039 :return: The header of the vcf file. 1040 """ 1041 1042 if self.header_vcf: 1043 if type == "vcf": 1044 return self.header_vcf 1045 elif type == "list": 1046 return self.header_list 1047 else: 1048 if type == "vcf": 1049 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1050 return header 1051 elif type == "list": 1052 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1054 def get_header_length(self, file: str = None) -> int: 1055 """ 1056 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1057 line. 1058 1059 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1060 header file. If this argument is provided, the function will read the header from the specified 1061 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1062 :type file: str 1063 :return: the length of the header list, excluding the #CHROM line. 1064 """ 1065 1066 if file: 1067 return len(self.read_vcf_header_file(file=file)) - 1 1068 elif self.get_header(type="list"): 1069 return len(self.get_header(type="list")) - 1 1070 else: 1071 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1073 def get_header_columns(self) -> str: 1074 """ 1075 This function returns the header list of a VCF 1076 1077 :return: The length of the header list. 1078 """ 1079 if self.get_header(): 1080 return self.get_header(type="list")[-1] 1081 else: 1082 return ""
This function returns the header list of a VCF
Returns
The columns line of the header (the #CHROM line), or an empty string when no header is loaded.
1084 def get_header_columns_as_list(self) -> list: 1085 """ 1086 This function returns the header list of a VCF 1087 1088 :return: The length of the header list. 1089 """ 1090 if self.get_header(): 1091 return self.get_header_columns().strip().split("\t") 1092 else: 1093 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1095 def get_header_columns_as_sql(self) -> str: 1096 """ 1097 This function retruns header length (without #CHROM line) 1098 1099 :return: The length of the header list. 1100 """ 1101 sql_column_list = [] 1102 for col in self.get_header_columns_as_list(): 1103 sql_column_list.append(f'"{col}"') 1104 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The header column names joined as a comma-separated list of double-quoted SQL identifiers.
1106 def get_header_sample_list(self) -> list: 1107 """ 1108 This function retruns header length (without #CHROM line) 1109 1110 :return: The length of the header list. 1111 """ 1112 return self.header_vcf.samples
This function retruns header length (without #CHROM line)
Returns
The list of sample names declared in the VCF header.
1114 def get_verbose(self) -> bool: 1115 """ 1116 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1117 exist 1118 1119 :return: The value of the key "verbose" in the config dictionary. 1120 """ 1121 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1123 def get_connexion_format(self) -> str: 1124 """ 1125 It returns the connexion format of the object. 1126 :return: The connexion_format is being returned. 1127 """ 1128 connexion_format = self.connexion_format 1129 if connexion_format not in ["duckdb", "sqlite"]: 1130 log.error(f"Unknown connexion format {connexion_format}") 1131 raise ValueError(f"Unknown connexion format {connexion_format}") 1132 else: 1133 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks with pandas and insert each chunk
        into the "variants" table of the connected database.

        :param file: Path or file-like object of the file to load; passed
            straight to pandas.read_csv
        :param columns: Comma-separated, SQL-quoted column names matching
            the "variants" table
        :type columns: str
        :param header_len: Number of leading lines to skip before the data
            (e.g. the VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to a tab
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; overridden by the
            "load.chunk" configuration entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured chunk size takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize (0/None from config) silently skips
        # the whole load — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the local DataFrame variable "chunk" by
                    # name (replacement scan), so the SQL can SELECT FROM it
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path appends the DataFrame through pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
fileparameter is the file that you want to load into a table. It should be the path to the file on your system - columns: The
columnsparameter in theinsert_file_to_tablefunction is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name - header_len: The
header_lenparameter in theinsert_file_to_tablefunction specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0 - sep: The
sepparameter in theinsert_file_to_tablefunction is used to specify the separator character that is used in the file being read. In this case, the default separator is set to, which represents a tab character. You can change this parameter to a different separator character if, defaults to - chunksize: The
chunksizeparameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value forchunksizeis set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file into the variants table.

        DuckDB connexions load through a Database object (any format it
        supports); SQLite connexions load vcf/tsv/csv/psv files through a
        chunked pandas insert. Optionally explodes INFO fields afterwards
        and always (re)creates indexes.

        :param input_file: Path to the input file; when given, replaces the
            current input and reloads the header
        :type input_file: str
        :param drop_variants_table: When True, drop the variants table
            before loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled to infer the schema;
            falsy values are converted to -1 (presumably "no limit" —
            TODO confirm against Database.get_sql_from), defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: If the input format cannot be loaded with the
            current connexion format.
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (falsy -> -1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): connexion_format is guaranteed to be "duckdb"
                # here, so the else/raise branch below is unreachable
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View from the input through a Database object
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access gets a VIEW; otherwise materialize a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # NOTE(review): bare except reports any failure (including
                    # SQL errors) as "format not available"
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this aliases (does not copy) `structure`; harmless
            # here because `structure` is not reused afterwards
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database table
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the length of each file chunk loaded
            chunksize = 100000

            # delimiter for the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): when compressed, the bgzf handle replaces the
                # plain one and is never closed (the `with` closes the original)
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The
drop_variants_tableparameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set toTrue, the variants table will be dropped. If set toFalse(default), the variants table will not be dropped, defaults to False - sample_size: The
sample_sizeparameter determines the number of rows to be sampled from the input file. If it is set toNone, the default value of 20480 will be used, defaults to 20480
1385 def get_explode_infos(self) -> bool: 1386 """ 1387 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1388 to False if it is not set. 1389 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1390 value. If the parameter is not present, it will return False. 1391 """ 1392 1393 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode into columns.

        Fields come from the argument, else from the "explode.explode_infos_fields"
        parameter, else default to "*". Each field is treated as a regex
        pattern matched against the header INFO fields; "*" matches all of
        them. Literal fields keep their input order; pattern matches are
        added in sorted order, excluding fields already listed explicitly.

        :param explode_infos_fields: Comma-separated string or list of
            field names / regex patterns; "*" selects all header INFO fields
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: When True, fields absent from
            the header are dropped from the result; when False they are
            kept as-is, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: Deduplicated list of field names to explode.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If still no fields, default to the "*" keyword (all header fields)
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Normalize input to a list of raw field tokens
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without the "*" keyword
            # NOTE(review): computed but never used below — confirm dead code
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Unique, sorted INFO field names from the header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # Translate the "*" keyword into a match-all regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # A literal header field keeps only itself; pattern matches
                # exclude fields that were also given explicitly in the input
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header, keep it anyway unless asked to
                # remove (tolerates not-well-formed headers)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already present, if it is in the
                    # header (when asked), and never the ".*" pattern itself
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The
explode_infos_fieldsparameter is a string that specifies the fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a comma-separated list of field names to explode - remove_fields_not_in_header: The parameter
remove_fields_not_in_headeris a boolean flag that determines whether to remove fields that are not present in the header. If it is set toTrue, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns
The function
get_explode_infos_fieldsreturns a list of exploded information fields. If theexplode_infos_fieldsparameter is not provided or is set to None, it returns an empty list. If the parameter is provided and its value is "ALL", it also returns an empty list. Otherwise, it returns a list of exploded information fields after removing any spaces and splitting the string by commas.
1495 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1496 """ 1497 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1498 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1499 not provided. 1500 1501 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1502 prefix to be used for exploding or expanding information 1503 :type explode_infos_prefix: str 1504 :return: the value of the variable `explode_infos_prefix`. 1505 """ 1506 1507 if not explode_infos_prefix: 1508 explode_infos_prefix = ( 1509 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1510 ) 1511 1512 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter
explode_infos_prefixis a string that specifies a prefix to be used for exploding or expanding information
Returns
the value of the variable
explode_infos_prefix.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table, optionally dropping and recreating it when
        it already exists.

        :param table_name: Name of the table to alter
        :param column_name: Name of the column to add (compared
            case-insensitively against existing columns)
        :param column_type: SQL type of the new column (e.g. "INTEGER",
            "VARCHAR")
        :param default_value: Optional DEFAULT value for the new column;
            NOTE(review): interpolated raw into the SQL, so strings need
            their own quoting — confirm callers only pass SQL-safe values
        :param drop: When True and the column exists, drop it and recreate
            it; when False, an existing column is left untouched, defaults
            to False
        :type drop: bool (optional)
        :return: A dict describing the added column (table_name, column_name,
            column_type, default_value) when a genuinely new column was
            added; None when the column already existed (including the
            drop-and-recreate case).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A recreated column is not reported as "added"
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The
column_typeparameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc - default_value: The
default_valueparameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column - drop: The
dropparameter is a boolean flag that determines whether to drop the column if it already exists in the table. Ifdropis set toTrue, the function will drop the existing column before adding the new column. Ifdropis set toFalse(default),, defaults to False
Returns
a boolean value indicating whether the column was successfully added to the table.
1586 def drop_column( 1587 self, column: dict = None, table_name: str = None, column_name: str = None 1588 ) -> bool: 1589 """ 1590 The `drop_column` function drops a specified column from a given table in a database and returns 1591 True if the column was successfully dropped, and False if the column does not exist in the 1592 table. 1593 1594 :param column: The `column` parameter is a dictionary that contains information about the column 1595 you want to drop. It has two keys: 1596 :type column: dict 1597 :param table_name: The `table_name` parameter is the name of the table from which you want to 1598 drop a column 1599 :type table_name: str 1600 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1601 from the table 1602 :type column_name: str 1603 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1604 and False if the column does not exist in the table. 1605 """ 1606 1607 # Find column infos 1608 if column: 1609 if isinstance(column, dict): 1610 table_name = column.get("table_name", None) 1611 column_name = column.get("column_name", None) 1612 elif isinstance(column, str): 1613 table_name = self.get_table_variants() 1614 column_name = column 1615 else: 1616 table_name = None 1617 column_name = None 1618 1619 if not table_name and not column_name: 1620 return False 1621 1622 # Removed 1623 removed = False 1624 1625 # Check if the column already exists in the table 1626 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1627 columns = self.get_query_to_df(query).columns.tolist() 1628 if column_name in columns: 1629 log.debug(f"The {column_name} column exists in the {table_name} table") 1630 else: 1631 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1632 return False 1633 1634 # Add column in table # ALTER TABLE integers DROP k 1635 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1636 
self.execute_query(add_column_query) 1637 removed = True 1638 log.debug( 1639 f"The {column_name} column was successfully dropped to the {table_name} table" 1640 ) 1641 1642 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The
columnparameter is a dictionary that contains information about the column you want to drop. It has two keys: - table_name: The
table_nameparameter is the name of the table from which you want to drop a column - column_name: The
column_nameparameter is the name of the column that you want to drop from the table
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual columns of the variants table.

        For each requested INFO field, a column named '<prefix><field>' is added
        to the table and populated by parsing the raw INFO string in SQL
        (REGEXP_EXTRACT for duckdb, instr/substr for sqlite). Nothing is done
        when the access mode is read-only ("RO").

        :param prefix: prefix for the exploded columns; falls back to
            self.get_explode_infos_prefix(), then to "INFO/"
        :param create_index: create indexes on the exploded columns afterwards
        :param fields: INFO fields to explode; patterns are translated through
            get_explode_infos_fields
        :param force: drop and recreate a column if it already exists
        :param proccess_all_fields_together: run one SQL UPDATE setting all
            exploded columns instead of one UPDATE per field
            (NOTE(review): parameter name is misspelled, kept for compatibility)
        :param table: target table; defaults to the variants table
        :return: list of added column names
        """

        # Indexes must be dropped before altering the table
        self.drop_indexes()

        # connexion format (drives the SQL dialect used below)
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" disables any table modification)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: explicit argument wins, then configured prefix, then "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header)
            # NOTE(review): bare except silently falls back to [] — presumably
            # covers a missing table; consider narrowing the exception
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Candidate field names: header INFO IDs plus explicitly requested fields
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields requested, normalize to an empty list
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. wildcards) were given
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name for the exploded field
                info_id_sql = prefix + info

                # Only explode fields known to the header, already prefixed,
                # or present as extra table columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (drop=force recreates an existing column)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the UPDATE expression extracting the value from INFO
                        if connexion_format in ["duckdb"]:
                            # ';' is prepended so the first key matches ';{info}='
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                        ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                    END
                            """
                        elif connexion_format in ["sqlite"]:
                            # sqlite has no regex: locate '{info}=' and slice up
                            # to the next ';' (or end of string)
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATEs smaller;
                # fall back to a single pass on the whole table
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when splitting by chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either all fields at once, or one by one
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The
`prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix.
- create_index: The
create_indexparameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set toTrue, indexes will be created; if set toFalse, indexes will not be created. The default value isFalse, defaults to False - fields: The
fieldsparameter in theexplode_infosfunction is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the ` - force: The
forceparameter in theexplode_infosfunction is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. Ifforceis set toTrue, the column will be dropped and recreated. Ifforceis set to `False, defaults to False - proccess_all_fields_together: The
proccess_all_fields_togetherparameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set toTrue, all the INFO fields will be processed together. If set toFalse, each INFO field will be processed individually. The default value is, defaults to False - table: The
tableparameter in theexplode_infosfunction is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for thetableparameter, the function will use that table name. If thetableparameter is
Returns
The
explode_infosfunction returns a list of added columns.
1861 def create_indexes(self) -> None: 1862 """ 1863 Create indexes on the table after insertion 1864 """ 1865 1866 # Access 1867 access = self.get_config().get("access", None) 1868 1869 # get table variants 1870 table_variants = self.get_table_variants("FROM") 1871 1872 if self.get_indexing() and access not in ["RO"]: 1873 # Create index 1874 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1875 self.conn.execute(sql_create_table_index) 1876 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1877 self.conn.execute(sql_create_table_index) 1878 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1879 self.conn.execute(sql_create_table_index) 1880 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1881 self.conn.execute(sql_create_table_index) 1882 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1883 self.conn.execute(sql_create_table_index) 1884 for field in self.index_additionnal_fields: 1885 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1886 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
1888 def drop_indexes(self) -> None: 1889 """ 1890 Create indexes on the table after insertion 1891 """ 1892 1893 # Access 1894 access = self.get_config().get("access", None) 1895 1896 # get table variants 1897 table_variants = self.get_table_variants("FROM") 1898 1899 # Get database format 1900 connexion_format = self.get_connexion_format() 1901 1902 if access not in ["RO"]: 1903 if connexion_format in ["duckdb"]: 1904 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1905 elif connexion_format in ["sqlite"]: 1906 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1907 1908 list_indexes = self.conn.execute(sql_list_indexes) 1909 index_names = [row[0] for row in list_indexes.fetchall()] 1910 for index in index_names: 1911 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1912 self.conn.execute(sql_drop_table_index)
Drop all indexes of the variants table.
1914 def read_vcf_header(self, f) -> list: 1915 """ 1916 It reads the header of a VCF file and returns a list of the header lines 1917 1918 :param f: the file object 1919 :return: The header lines of the VCF file. 1920 """ 1921 1922 header_list = [] 1923 for line in f: 1924 header_list.append(line) 1925 if line.startswith("#CHROM"): 1926 break 1927 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
1929 def read_vcf_header_file(self, file: str = None) -> list: 1930 """ 1931 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1932 uncompressed files. 1933 1934 :param file: The `file` parameter is a string that represents the path to the VCF header file 1935 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1936 default to `None` 1937 :type file: str 1938 :return: The function `read_vcf_header_file` returns a list. 1939 """ 1940 1941 if self.get_input_compressed(input_file=file): 1942 with bgzf.open(file, "rt") as f: 1943 return self.read_vcf_header(f=f) 1944 else: 1945 with open(file, "rt") as f: 1946 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The
`file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`.
Returns
The function
read_vcf_header_filereturns a list.
1948 def execute_query(self, query: str): 1949 """ 1950 It takes a query as an argument, executes it, and returns the results 1951 1952 :param query: The query to be executed 1953 :return: The result of the query is being returned. 1954 """ 1955 if query: 1956 return self.conn.execute(query) # .fetchall() 1957 else: 1958 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
1960 def export_output( 1961 self, 1962 output_file: str | None = None, 1963 output_header: str | None = None, 1964 export_header: bool = True, 1965 query: str | None = None, 1966 parquet_partitions: list | None = None, 1967 chunk_size: int | None = None, 1968 threads: int | None = None, 1969 sort: bool = False, 1970 index: bool = False, 1971 order_by: str | None = None, 1972 ) -> bool: 1973 """ 1974 The `export_output` function exports data from a VCF file to a specified output file in various 1975 formats, including VCF, CSV, TSV, PSV, and Parquet. 1976 1977 :param output_file: The `output_file` parameter is a string that specifies the name of the 1978 output file to be generated by the function. This is where the exported data will be saved 1979 :type output_file: str 1980 :param output_header: The `output_header` parameter is a string that specifies the name of the 1981 file where the header of the VCF file will be exported. If this parameter is not provided, the 1982 header will be exported to a file with the same name as the `output_file` parameter, but with 1983 the extension " 1984 :type output_header: str 1985 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1986 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1987 True, the header will be exported to a file. If `export_header` is False, the header will not 1988 be, defaults to True, if output format is not VCF 1989 :type export_header: bool (optional) 1990 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1991 select specific data from the VCF file before exporting it. If provided, only the data that 1992 matches the query will be exported 1993 :type query: str 1994 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1995 columns to be used for partitioning the Parquet file during export. 
Partitioning is a way to 1996 organize data in a hierarchical directory structure based on the values of one or more columns. 1997 This can improve query performance when working with large datasets 1998 :type parquet_partitions: list 1999 :param chunk_size: The `chunk_size` parameter specifies the number of 2000 records in batch when exporting data in Parquet format. This parameter is used for 2001 partitioning the Parquet file into multiple files. 2002 :type chunk_size: int 2003 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2004 threads to be used during the export process. It determines the level of parallelism and can 2005 improve the performance of the export operation. If not provided, the function will use the 2006 default number of threads 2007 :type threads: int 2008 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2009 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2010 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2011 False 2012 :type sort: bool (optional) 2013 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2014 created on the output file. If `index` is True, an index will be created. If `index` is False, 2015 no index will be created. The default value is False, defaults to False 2016 :type index: bool (optional) 2017 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2018 sorting the output file. This parameter is only applicable when exporting data in VCF format 2019 :type order_by: str 2020 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2021 None if it doesn't. 
2022 """ 2023 2024 # Log 2025 log.info("Exporting...") 2026 2027 # Full path 2028 output_file = full_path(output_file) 2029 output_header = full_path(output_header) 2030 2031 # Config 2032 config = self.get_config() 2033 2034 # Param 2035 param = self.get_param() 2036 2037 # Tmp files to remove 2038 tmp_to_remove = [] 2039 2040 # If no output, get it 2041 if not output_file: 2042 output_file = self.get_output() 2043 2044 # If not threads 2045 if not threads: 2046 threads = self.get_threads() 2047 2048 # Auto header name with extension 2049 if export_header or output_header: 2050 if not output_header: 2051 output_header = f"{output_file}.hdr" 2052 # Export header 2053 self.export_header(output_file=output_file) 2054 2055 # Switch off export header if VCF output 2056 output_file_type = get_file_format(output_file) 2057 if output_file_type in ["vcf"]: 2058 export_header = False 2059 tmp_to_remove.append(output_header) 2060 2061 # Chunk size 2062 if not chunk_size: 2063 chunk_size = config.get("chunk_size", None) 2064 2065 # Parquet partition 2066 if not parquet_partitions: 2067 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2068 if parquet_partitions and isinstance(parquet_partitions, str): 2069 parquet_partitions = parquet_partitions.split(",") 2070 2071 # Order by 2072 if not order_by: 2073 order_by = param.get("export", {}).get("order_by", "") 2074 2075 # Header in output 2076 header_in_output = param.get("export", {}).get("include_header", False) 2077 2078 # Database 2079 database_source = self.get_connexion() 2080 2081 # Connexion format 2082 connexion_format = self.get_connexion_format() 2083 2084 # Explode infos 2085 if self.get_explode_infos(): 2086 self.explode_infos( 2087 prefix=self.get_explode_infos_prefix(), 2088 fields=self.get_explode_infos_fields(), 2089 force=False, 2090 ) 2091 2092 # if connexion_format in ["sqlite"] or query: 2093 if connexion_format in ["sqlite"]: 2094 2095 # Export in Parquet 2096 random_tmp = 
"".join( 2097 random.choice(string.ascii_lowercase) for i in range(10) 2098 ) 2099 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2100 tmp_to_remove.append(database_source) 2101 2102 # Table Variants 2103 table_variants = self.get_table_variants() 2104 2105 # Create export query 2106 sql_query_export_subquery = f""" 2107 SELECT * FROM {table_variants} 2108 """ 2109 2110 # Write source file 2111 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2112 2113 # Create database 2114 database = Database( 2115 database=database_source, 2116 table="variants", 2117 header_file=output_header, 2118 conn_config=self.get_connexion_config(), 2119 ) 2120 2121 # Existing colomns header 2122 # existing_columns_header = database.get_header_file_columns(output_header) 2123 existing_columns_header = database.get_header_columns_from_database() 2124 2125 # Export file 2126 database.export( 2127 output_database=output_file, 2128 output_header=output_header, 2129 existing_columns_header=existing_columns_header, 2130 parquet_partitions=parquet_partitions, 2131 chunk_size=chunk_size, 2132 threads=threads, 2133 sort=sort, 2134 index=index, 2135 header_in_output=header_in_output, 2136 order_by=order_by, 2137 query=query, 2138 export_header=export_header, 2139 ) 2140 2141 # Remove 2142 remove_if_exists(tmp_to_remove) 2143 2144 return (os.path.exists(output_file) or None) and ( 2145 os.path.exists(output_file) or None 2146 )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: The
output_fileparameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved - output_header: The
output_headerparameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as theoutput_fileparameter, but with the extension " - export_header: The
export_headerparameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. Ifexport_headeris True, the header will be exported to a file. Ifexport_headeris False, the header will not be, defaults to True, if output format is not VCF - query: The
queryparameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported - parquet_partitions: The
parquet_partitionsparameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets - chunk_size: The
chunk_sizeparameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. - threads: The
threadsparameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads - sort: The
sortparameter is a boolean flag that determines whether the output file should be sorted or not. Ifsortis set toTrue, the output file will be sorted based on the genomic coordinates of the variants. By default, the value ofsortisFalse, defaults to False - index: The
indexparameter is a boolean flag that determines whether an index should be created on the output file. Ifindexis True, an index will be created. Ifindexis False, no index will be created. The default value is False, defaults to False - order_by: The
order_byparameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2148 def get_extra_infos(self, table: str = None) -> list: 2149 """ 2150 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2151 in the header. 2152 2153 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2154 name of the table from which you want to retrieve the extra columns that are not present in the 2155 header. If the `table` parameter is not provided when calling the function, it will default to 2156 using the variants 2157 :type table: str 2158 :return: A list of columns that are in the specified table but not in the header of the table. 2159 """ 2160 2161 header_columns = [] 2162 2163 if not table: 2164 table = self.get_table_variants(clause="from") 2165 header_columns = self.get_header_columns() 2166 2167 # Check all columns in the database 2168 query = f""" SELECT * FROM {table} LIMIT 1 """ 2169 log.debug(f"query {query}") 2170 table_columns = self.get_query_to_df(query).columns.tolist() 2171 extra_columns = [] 2172 2173 # Construct extra infos (not in header) 2174 for column in table_columns: 2175 if column not in header_columns: 2176 extra_columns.append(column) 2177 2178 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
tableparameter in theget_extra_infosfunction is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If thetableparameter is not provided when calling the function, it will default to using the variants
Returns
A list of columns that are in the specified table but not in the header of the table.
2180 def get_extra_infos_sql(self, table: str = None) -> str: 2181 """ 2182 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2183 by double quotes 2184 2185 :param table: The name of the table to get the extra infos from. If None, the default table is 2186 used 2187 :type table: str 2188 :return: A string of the extra infos 2189 """ 2190 2191 return ", ".join( 2192 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2193 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2195 def export_header( 2196 self, 2197 header_name: str = None, 2198 output_file: str = None, 2199 output_file_ext: str = ".hdr", 2200 clean_header: bool = True, 2201 remove_chrom_line: bool = False, 2202 ) -> str: 2203 """ 2204 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2205 specified options, and writes it to a new file. 2206 2207 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2208 this parameter is not specified, the header will be written to the output file 2209 :type header_name: str 2210 :param output_file: The `output_file` parameter in the `export_header` function is used to 2211 specify the name of the output file where the header will be written. If this parameter is not 2212 provided, the header will be written to a temporary file 2213 :type output_file: str 2214 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2215 string that represents the extension of the output header file. By default, it is set to ".hdr" 2216 if not specified by the user. This extension will be appended to the `output_file` name to 2217 create the final, defaults to .hdr 2218 :type output_file_ext: str (optional) 2219 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2220 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2221 `True`, the function will clean the header by modifying certain lines based on a specific 2222 pattern. If `clean_header`, defaults to True 2223 :type clean_header: bool (optional) 2224 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2225 boolean flag that determines whether the #CHROM line should be removed from the header before 2226 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2227 defaults to False 2228 :type remove_chrom_line: bool (optional) 2229 :return: The function `export_header` returns the name of the temporary header file that is 2230 created. 2231 """ 2232 2233 if not header_name and not output_file: 2234 output_file = self.get_output() 2235 2236 if self.get_header(): 2237 2238 # Get header object 2239 header_obj = self.get_header() 2240 2241 # Create database 2242 db_for_header = Database(database=self.get_input()) 2243 2244 # Get real columns in the file 2245 db_header_columns = db_for_header.get_columns() 2246 2247 with tempfile.TemporaryDirectory() as tmpdir: 2248 2249 # Write header file 2250 header_file_tmp = os.path.join(tmpdir, "header") 2251 f = open(header_file_tmp, "w") 2252 vcf.Writer(f, header_obj) 2253 f.close() 2254 2255 # Replace #CHROM line with rel columns 2256 header_list = db_for_header.read_header_file( 2257 header_file=header_file_tmp 2258 ) 2259 header_list[-1] = "\t".join(db_header_columns) 2260 2261 # Remove CHROM line 2262 if remove_chrom_line: 2263 header_list.pop() 2264 2265 # Clean header 2266 if clean_header: 2267 header_list_clean = [] 2268 for head in header_list: 2269 # Clean head for malformed header 2270 head_clean = head 2271 head_clean = re.subn( 2272 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2273 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2274 head_clean, 2275 2, 2276 )[0] 2277 # Write header 2278 header_list_clean.append(head_clean) 2279 header_list = header_list_clean 2280 2281 tmp_header_name = output_file + output_file_ext 2282 2283 f = open(tmp_header_name, "w") 2284 for line in header_list: 2285 f.write(line) 2286 f.close() 2287 2288 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
remove_chrom_lineparameter in theexport_headerfunction is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set toTrue, the #CHROM line will be removed; if set to `, defaults to False
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2290 def export_variant_vcf( 2291 self, 2292 vcf_file, 2293 remove_info: bool = False, 2294 add_samples: bool = True, 2295 list_samples: list = [], 2296 where_clause: str = "", 2297 index: bool = False, 2298 threads: int | None = None, 2299 ) -> bool | None: 2300 """ 2301 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2302 remove INFO field, add samples, and control compression and indexing. 2303 2304 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2305 written to. It is the output file that will contain the filtered VCF data based on the specified 2306 parameters 2307 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2308 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2309 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2310 in, defaults to False 2311 :type remove_info: bool (optional) 2312 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2313 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2314 If set to False, the samples will be removed. The default value is True, defaults to True 2315 :type add_samples: bool (optional) 2316 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2317 in the output VCF file. By default, all samples will be included. If you provide a list of 2318 samples, only those samples will be included in the output file 2319 :type list_samples: list 2320 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2321 determines whether or not to create an index for the output VCF file. If `index` is set to 2322 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2323 :type index: bool (optional) 2324 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2325 number of threads to use for exporting the VCF file. It determines how many parallel threads 2326 will be used during the export process. More threads can potentially speed up the export process 2327 by utilizing multiple cores of the processor. If 2328 :type threads: int | None 2329 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2330 method with various parameters including the output file, query, threads, sort flag, and index 2331 flag. The `export_output` method is responsible for exporting the VCF data based on the 2332 specified parameters and configurations provided in the `export_variant_vcf` function. 2333 """ 2334 2335 # Config 2336 config = self.get_config() 2337 2338 # Extract VCF 2339 log.debug("Export VCF...") 2340 2341 # Table variants 2342 table_variants = self.get_table_variants() 2343 2344 # Threads 2345 if not threads: 2346 threads = self.get_threads() 2347 2348 # Info fields 2349 if remove_info: 2350 if not isinstance(remove_info, str): 2351 remove_info = "." 
2352 info_field = f"""'{remove_info}' as INFO""" 2353 else: 2354 info_field = "INFO" 2355 2356 # Samples fields 2357 if add_samples: 2358 if not list_samples: 2359 list_samples = self.get_header_sample_list() 2360 if list_samples: 2361 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2362 else: 2363 samples_fields = "" 2364 log.debug(f"samples_fields: {samples_fields}") 2365 else: 2366 samples_fields = "" 2367 2368 # Where clause 2369 if where_clause is None: 2370 where_clause = "" 2371 2372 # Variants 2373 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2374 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2375 log.debug(f"sql_query_select={sql_query_select}") 2376 2377 return self.export_output( 2378 output_file=vcf_file, 2379 output_header=None, 2380 export_header=True, 2381 query=sql_query_select, 2382 parquet_partitions=None, 2383 chunk_size=config.get("chunk_size", None), 2384 threads=threads, 2385 sort=True, 2386 index=index, 2387 order_by=None, 2388 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters.
- remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to `True`, the INFO field will be removed; if set to `False`, the INFO field will be included. Defaults to False.
- add_samples: The `add_samples` parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True.
- list_samples: The `list_samples` parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file.
- index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to `True`, the output VCF file will be indexed using tabix. Defaults to False.
- threads: The `threads` parameter in the `export_variant_vcf` function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor.
Returns
The `export_variant_vcf` function returns the result of calling the `export_output` method with various parameters including the output file, query, threads, sort flag, and index flag. The `export_output` method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the `export_variant_vcf` function.
2390 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2391 """ 2392 It takes a list of commands and runs them in parallel using the number of threads specified 2393 2394 :param commands: A list of commands to run 2395 :param threads: The number of threads to use, defaults to 1 (optional) 2396 """ 2397 2398 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2400 def get_threads(self, default: int = 1) -> int: 2401 """ 2402 This function returns the number of threads to use for a job, with a default value of 1 if not 2403 specified. 2404 2405 :param default: The `default` parameter in the `get_threads` method is used to specify the 2406 default number of threads to use if no specific value is provided. If no value is provided for 2407 the `threads` parameter in the configuration or input parameters, the `default` value will be 2408 used, defaults to 1 2409 :type default: int (optional) 2410 :return: the number of threads to use for the current job. 2411 """ 2412 2413 # Config 2414 config = self.get_config() 2415 2416 # Param 2417 param = self.get_param() 2418 2419 # Input threads 2420 input_thread = param.get("threads", config.get("threads", None)) 2421 2422 # Check threads 2423 if not input_thread: 2424 threads = default 2425 elif int(input_thread) <= 0: 2426 threads = os.cpu_count() 2427 else: 2428 threads = int(input_thread) 2429 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The `default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used. Defaults to 1.
Returns
the number of threads to use for the current job.
2431 def get_memory(self, default: str = None) -> str: 2432 """ 2433 This function retrieves the memory value from parameters or configuration with a default value 2434 if not found. 2435 2436 :param default: The `get_memory` function takes in a default value as a string parameter. This 2437 default value is used as a fallback in case the `memory` parameter is not provided in the 2438 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2439 the function 2440 :type default: str 2441 :return: The `get_memory` function returns a string value representing the memory parameter. If 2442 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2443 return the default value provided as an argument to the function. 2444 """ 2445 2446 # Config 2447 config = self.get_config() 2448 2449 # Param 2450 param = self.get_param() 2451 2452 # Input threads 2453 input_memory = param.get("memory", config.get("memory", None)) 2454 2455 # Check threads 2456 if input_memory: 2457 memory = input_memory 2458 else: 2459 memory = default 2460 2461 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The `get_memory` function takes in a default value as a string parameter. This default value is used as a fallback in case the `memory` parameter is not provided in the `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, the function returns this default value.
Returns
The `get_memory` function returns a string value representing the memory parameter. If `input_memory` is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2463 def update_from_vcf(self, vcf_file: str) -> None: 2464 """ 2465 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2466 2467 :param vcf_file: the path to the VCF file 2468 """ 2469 2470 connexion_format = self.get_connexion_format() 2471 2472 if connexion_format in ["duckdb"]: 2473 self.update_from_vcf_duckdb(vcf_file) 2474 elif connexion_format in ["sqlite"]: 2475 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file (DuckDB).

        The VCF data lines are loaded into a pandas DataFrame which DuckDB
        queries directly (replacement scan on the local variable name
        ``vcf_df``). For every variant matching on #CHROM/POS/REF/ALT, the VCF
        INFO content is appended to the existing INFO value, separated by ';'
        when both sides are non-empty ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load VCF data lines into a temporary DataFrame: skip the '##' header
        # lines so the '#CHROM' line becomes the column header.
        skip = self.get_header_length(file=vcf_file)
        # NOTE: 'vcf_df' is referenced by name in the SQL below through
        # DuckDB's replacement scan — do not rename this variable.
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Correlated subquery: concat() treats NULL as '', so variants with no
        # match in the VCF keep their original INFO value.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                        AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2533 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2534 """ 2535 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2536 table, then updates the INFO column of the variants table with the INFO column of the temporary 2537 table 2538 2539 :param vcf_file: The path to the VCF file you want to update the database with 2540 """ 2541 2542 # Create a temporary table for the VCF 2543 table_vcf = "tmp_vcf" 2544 sql_create = ( 2545 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2546 ) 2547 self.conn.execute(sql_create) 2548 2549 # Loading VCF into temporaire table 2550 vcf_df = pd.read_csv( 2551 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2552 ) 2553 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2554 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2555 2556 # Update table 'variants' with VCF data 2557 # warning: CONCAT as || operator 2558 sql_query_update = f""" 2559 UPDATE variants as table_variants 2560 SET INFO = CASE 2561 WHEN INFO NOT IN ('', '.') 2562 THEN INFO 2563 ELSE '' 2564 END || 2565 ( 2566 SELECT 2567 CASE 2568 WHEN table_variants.INFO NOT IN ('','.') 2569 AND table_vcf.INFO NOT IN ('','.') 2570 THEN ';' 2571 ELSE '' 2572 END || 2573 CASE 2574 WHEN table_vcf.INFO NOT IN ('','.') 2575 THEN table_vcf.INFO 2576 ELSE '' 2577 END 2578 FROM {table_vcf} as table_vcf 2579 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2580 AND table_vcf.\"POS\" = table_variants.\"POS\" 2581 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2582 AND table_vcf.\"REF\" = table_variants.\"REF\" 2583 ) 2584 """ 2585 self.conn.execute(sql_query_update) 2586 2587 # Drop temporary table 2588 sql_drop = f"DROP TABLE {table_vcf}" 2589 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2591 def drop_variants_table(self) -> None: 2592 """ 2593 > This function drops the variants table 2594 """ 2595 2596 table_variants = self.get_table_variants() 2597 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2598 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash derived from the assembly and the `#CHROM`, `POS`, `REF` and `ALT`
        columns.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be (re)created even if
        it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param wins over config, then the default assembly)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column name fallback
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the hard-coded name
        # "variant_id", not `variant_id_column` — confirm this is intentional
        # when a custom column name is requested.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument '"{prefix}SVTYPE"' is a
            # quoted string literal (the column *name*), not the column value —
            # confirm whether the SVTYPE value was meant to feed the hash.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called `variant_id` and populates it with a hash of the `#CHROM`, `POS`, `REF`, and `ALT` columns.
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2659 def get_variant_id_column( 2660 self, variant_id_column: str = "variant_id", force: bool = None 2661 ) -> str: 2662 """ 2663 This function returns the variant_id column name 2664 2665 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2666 defaults to variant_id 2667 :type variant_id_column: str (optional) 2668 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2669 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2670 if it is not already set, or if it is set 2671 :type force: bool 2672 :return: The variant_id column name. 2673 """ 2674 2675 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2681 def scan_databases( 2682 self, 2683 database_formats: list = ["parquet"], 2684 database_releases: list = ["current"], 2685 ) -> dict: 2686 """ 2687 The function `scan_databases` scans for available databases based on specified formats and 2688 releases. 2689 2690 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2691 of the databases to be scanned. In this case, the accepted format is "parquet" 2692 :type database_formats: list ["parquet"] 2693 :param database_releases: The `database_releases` parameter is a list that specifies the 2694 releases of the databases to be scanned. In the provided function, the default value for 2695 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2696 databases that are in the "current" 2697 :type database_releases: list 2698 :return: The function `scan_databases` returns a dictionary containing information about 2699 databases that match the specified formats and releases. 2700 """ 2701 2702 # Config 2703 config = self.get_config() 2704 2705 # Param 2706 param = self.get_param() 2707 2708 # Param - Assembly 2709 assembly = param.get("assembly", config.get("assembly", None)) 2710 if not assembly: 2711 assembly = DEFAULT_ASSEMBLY 2712 log.warning(f"Default assembly '{assembly}'") 2713 2714 # Scan for availabled databases 2715 log.info( 2716 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2717 ) 2718 databases_infos_dict = databases_infos( 2719 database_folder_releases=database_releases, 2720 database_formats=database_formats, 2721 assembly=assembly, 2722 config=config, 2723 ) 2724 log.info( 2725 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2726 ) 2727 2728 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The `database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet".
- database_releases: The `database_releases` parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for `database_releases` is set to `["current"]`, meaning that by default, the function will scan databases that are in the "current" release.
Returns
The function `scan_databases` returns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        The 'annotations' parameter (and the per-tool 'annotation_*' shortcut
        parameters) is normalized into the structured 'annotation' parameter,
        annotation databases are resolved on disk, and each configured
        annotation tool is then run in turn.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (annotations + parquet + bcftools)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tool's shortcut param is folded into the annotations list with
        # a "tool:" prefix
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters (string form becomes a dict with
            # 'INFO' meaning "all fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: replace the 'ALL[:...]' entry by
                # every database found on disk
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both separate files)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): bcftools_preference is
                                    # hard-coded to False, so the bcftools
                                    # branch below is currently unreachable.
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" appears twice in
                                        # this list (harmless duplicate).
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

            self.set_param(param)

        # Run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
3102 def annotation_snpsift(self, threads: int = None) -> None: 3103 """ 3104 This function annotate with bcftools 3105 3106 :param threads: Number of threads to use 3107 :return: the value of the variable "return_value". 3108 """ 3109 3110 # DEBUG 3111 log.debug("Start annotation with bcftools databases") 3112 3113 # Threads 3114 if not threads: 3115 threads = self.get_threads() 3116 log.debug("Threads: " + str(threads)) 3117 3118 # Config 3119 config = self.get_config() 3120 log.debug("Config: " + str(config)) 3121 3122 # Config - snpSift 3123 snpsift_bin_command = get_bin_command( 3124 bin="SnpSift.jar", 3125 tool="snpsift", 3126 bin_type="jar", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3129 ) 3130 if not snpsift_bin_command: 3131 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - bcftools 3136 bcftools_bin_command = get_bin_command( 3137 bin="bcftools", 3138 tool="bcftools", 3139 bin_type="bin", 3140 config=config, 3141 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3142 ) 3143 if not bcftools_bin_command: 3144 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3145 log.error(msg_err) 3146 raise ValueError(msg_err) 3147 3148 # Config - BCFTools databases folders 3149 databases_folders = set( 3150 self.get_config() 3151 .get("folders", {}) 3152 .get("databases", {}) 3153 .get("annotations", ["."]) 3154 + self.get_config() 3155 .get("folders", {}) 3156 .get("databases", {}) 3157 .get("bcftools", ["."]) 3158 ) 3159 log.debug("Databases annotations: " + str(databases_folders)) 3160 3161 # Param 3162 annotations = ( 3163 self.get_param() 3164 .get("annotation", {}) 3165 .get("snpsift", {}) 3166 .get("annotations", None) 3167 ) 3168 log.debug("Annotations: " + str(annotations)) 3169 3170 # Assembly 3171 assembly = self.get_param().get( 3172 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3173 ) 3174 
3175 # Data 3176 table_variants = self.get_table_variants() 3177 3178 # Check if not empty 3179 log.debug("Check if not empty") 3180 sql_query_chromosomes = ( 3181 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3182 ) 3183 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3184 if not sql_query_chromosomes_df["count"][0]: 3185 log.info(f"VCF empty") 3186 return 3187 3188 # VCF header 3189 vcf_reader = self.get_header() 3190 log.debug("Initial header: " + str(vcf_reader.infos)) 3191 3192 # Existing annotations 3193 for vcf_annotation in self.get_header().infos: 3194 3195 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3196 log.debug( 3197 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3198 ) 3199 3200 if annotations: 3201 3202 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3203 3204 # Export VCF file 3205 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3206 3207 # Init 3208 commands = {} 3209 3210 for annotation in annotations: 3211 annotation_fields = annotations[annotation] 3212 3213 # Annotation Name 3214 annotation_name = os.path.basename(annotation) 3215 3216 if not annotation_fields: 3217 annotation_fields = {"INFO": None} 3218 3219 log.debug(f"Annotation '{annotation_name}'") 3220 log.debug( 3221 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3222 ) 3223 3224 # Create Database 3225 database = Database( 3226 database=annotation, 3227 databases_folders=databases_folders, 3228 assembly=assembly, 3229 ) 3230 3231 # Find files 3232 db_file = database.get_database() 3233 db_file = full_path(db_file) 3234 db_hdr_file = database.get_header_file() 3235 db_hdr_file = full_path(db_hdr_file) 3236 db_file_type = database.get_format() 3237 db_tbi_file = f"{db_file}.tbi" 3238 db_file_compressed = database.is_compressed() 3239 3240 # Check if compressed 3241 if not db_file_compressed: 3242 log.error( 3243 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3244 ) 3245 raise ValueError( 3246 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3247 ) 3248 3249 # Check if indexed 3250 if not os.path.exists(db_tbi_file): 3251 log.error( 3252 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3253 ) 3254 raise ValueError( 3255 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3256 ) 3257 3258 # Check index - try to create if not exists 3259 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3260 log.error("Annotation failed: database not valid") 3261 log.error(f"Annotation annotation file: {db_file}") 3262 log.error(f"Annotation annotation header: {db_hdr_file}") 3263 log.error(f"Annotation annotation index: {db_tbi_file}") 3264 raise ValueError( 3265 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3266 ) 3267 else: 3268 3269 log.debug( 3270 f"Annotation '{annotation}' - file: " 3271 + str(db_file) 3272 + " and " 3273 + str(db_hdr_file) 3274 ) 3275 3276 # Load header as VCF object 3277 db_hdr_vcf = Variants(input=db_hdr_file) 3278 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3279 log.debug( 3280 "Annotation database header: " 3281 + str(db_hdr_vcf_header_infos) 3282 ) 3283 3284 # For all fields in database 3285 annotation_fields_full = False 3286 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3287 annotation_fields = { 3288 key: key for key in db_hdr_vcf_header_infos 3289 } 3290 log.debug( 3291 "Annotation database header - All annotations added: " 3292 + str(annotation_fields) 3293 ) 3294 annotation_fields_full = True 3295 3296 # # Create file for field rename 3297 # log.debug("Create file for field rename") 3298 # tmp_rename = NamedTemporaryFile( 3299 # prefix=self.get_prefix(), 3300 # dir=self.get_tmp_dir(), 3301 # suffix=".rename", 3302 # delete=False, 3303 # ) 3304 # tmp_rename_name = tmp_rename.name 
3305 # tmp_files.append(tmp_rename_name) 3306 3307 # Number of fields 3308 nb_annotation_field = 0 3309 annotation_list = [] 3310 annotation_infos_rename_list = [] 3311 3312 for annotation_field in annotation_fields: 3313 3314 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3315 annotation_fields_new_name = annotation_fields.get( 3316 annotation_field, annotation_field 3317 ) 3318 if not annotation_fields_new_name: 3319 annotation_fields_new_name = annotation_field 3320 3321 # Check if field is in DB and if field is not elready in input data 3322 if ( 3323 annotation_field in db_hdr_vcf.get_header().infos 3324 and annotation_fields_new_name 3325 not in self.get_header().infos 3326 ): 3327 3328 log.info( 3329 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3330 ) 3331 3332 # BCFTools annotate param to rename fields 3333 if annotation_field != annotation_fields_new_name: 3334 annotation_infos_rename_list.append( 3335 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3336 ) 3337 3338 # Add INFO field to header 3339 db_hdr_vcf_header_infos_number = ( 3340 db_hdr_vcf_header_infos[annotation_field].num or "." 
3341 ) 3342 db_hdr_vcf_header_infos_type = ( 3343 db_hdr_vcf_header_infos[annotation_field].type 3344 or "String" 3345 ) 3346 db_hdr_vcf_header_infos_description = ( 3347 db_hdr_vcf_header_infos[annotation_field].desc 3348 or f"{annotation_field} description" 3349 ) 3350 db_hdr_vcf_header_infos_source = ( 3351 db_hdr_vcf_header_infos[annotation_field].source 3352 or "unknown" 3353 ) 3354 db_hdr_vcf_header_infos_version = ( 3355 db_hdr_vcf_header_infos[annotation_field].version 3356 or "unknown" 3357 ) 3358 3359 vcf_reader.infos[annotation_fields_new_name] = ( 3360 vcf.parser._Info( 3361 annotation_fields_new_name, 3362 db_hdr_vcf_header_infos_number, 3363 db_hdr_vcf_header_infos_type, 3364 db_hdr_vcf_header_infos_description, 3365 db_hdr_vcf_header_infos_source, 3366 db_hdr_vcf_header_infos_version, 3367 self.code_type_map[ 3368 db_hdr_vcf_header_infos_type 3369 ], 3370 ) 3371 ) 3372 3373 annotation_list.append(annotation_field) 3374 3375 nb_annotation_field += 1 3376 3377 else: 3378 3379 if ( 3380 annotation_field 3381 not in db_hdr_vcf.get_header().infos 3382 ): 3383 log.warning( 3384 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3385 ) 3386 if ( 3387 annotation_fields_new_name 3388 in self.get_header().infos 3389 ): 3390 log.warning( 3391 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3392 ) 3393 3394 log.info( 3395 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3396 ) 3397 3398 annotation_infos = ",".join(annotation_list) 3399 3400 if annotation_infos != "": 3401 3402 # Annotated VCF (and error file) 3403 tmp_annotation_vcf_name = os.path.join( 3404 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3405 ) 3406 tmp_annotation_vcf_name_err = ( 3407 tmp_annotation_vcf_name + ".err" 3408 ) 3409 3410 # Add fields to annotate 3411 if not annotation_fields_full: 3412 annotation_infos_option = f"-info {annotation_infos}" 3413 else: 
3414 annotation_infos_option = "" 3415 3416 # Info fields rename 3417 if annotation_infos_rename_list: 3418 annotation_infos_rename = " -c " + ",".join( 3419 annotation_infos_rename_list 3420 ) 3421 else: 3422 annotation_infos_rename = "" 3423 3424 # Annotate command 3425 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3426 3427 # Add command 3428 commands[command_annotate] = tmp_annotation_vcf_name 3429 3430 if commands: 3431 3432 # Export VCF file 3433 self.export_variant_vcf( 3434 vcf_file=tmp_vcf_name, 3435 remove_info=True, 3436 add_samples=False, 3437 index=True, 3438 ) 3439 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3440 3441 # Num command 3442 nb_command = 0 3443 3444 # Annotate 3445 for command_annotate in commands: 3446 nb_command += 1 3447 log.info( 3448 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3449 ) 3450 log.debug(f"command_annotate={command_annotate}") 3451 run_parallel_commands([command_annotate], threads) 3452 3453 # Debug 3454 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3455 3456 # Update variants 3457 log.info( 3458 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3459 ) 3460 self.update_from_vcf(commands[command_annotate])
This function annotates with SnpSift.
Parameters
- threads: Number of threads to use
Returns
None
3462 def annotation_bcftools(self, threads: int = None) -> None: 3463 """ 3464 This function annotate with bcftools 3465 3466 :param threads: Number of threads to use 3467 :return: the value of the variable "return_value". 3468 """ 3469 3470 # DEBUG 3471 log.debug("Start annotation with bcftools databases") 3472 3473 # Threads 3474 if not threads: 3475 threads = self.get_threads() 3476 log.debug("Threads: " + str(threads)) 3477 3478 # Config 3479 config = self.get_config() 3480 log.debug("Config: " + str(config)) 3481 3482 # DEBUG 3483 delete_tmp = True 3484 if self.get_config().get("verbosity", "warning") in ["debug"]: 3485 delete_tmp = False 3486 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3487 3488 # Config - BCFTools bin command 3489 bcftools_bin_command = get_bin_command( 3490 bin="bcftools", 3491 tool="bcftools", 3492 bin_type="bin", 3493 config=config, 3494 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3495 ) 3496 if not bcftools_bin_command: 3497 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3498 log.error(msg_err) 3499 raise ValueError(msg_err) 3500 3501 # Config - BCFTools databases folders 3502 databases_folders = set( 3503 self.get_config() 3504 .get("folders", {}) 3505 .get("databases", {}) 3506 .get("annotations", ["."]) 3507 + self.get_config() 3508 .get("folders", {}) 3509 .get("databases", {}) 3510 .get("bcftools", ["."]) 3511 ) 3512 log.debug("Databases annotations: " + str(databases_folders)) 3513 3514 # Param 3515 annotations = ( 3516 self.get_param() 3517 .get("annotation", {}) 3518 .get("bcftools", {}) 3519 .get("annotations", None) 3520 ) 3521 log.debug("Annotations: " + str(annotations)) 3522 3523 # Assembly 3524 assembly = self.get_param().get( 3525 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3526 ) 3527 3528 # Data 3529 table_variants = self.get_table_variants() 3530 3531 # Check if not empty 3532 log.debug("Check if not empty") 3533 sql_query_chromosomes = ( 3534 f"""SELECT 
count(*) as count FROM {table_variants} as table_variants""" 3535 ) 3536 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3537 if not sql_query_chromosomes_df["count"][0]: 3538 log.info(f"VCF empty") 3539 return 3540 3541 # Export in VCF 3542 log.debug("Create initial file to annotate") 3543 tmp_vcf = NamedTemporaryFile( 3544 prefix=self.get_prefix(), 3545 dir=self.get_tmp_dir(), 3546 suffix=".vcf.gz", 3547 delete=False, 3548 ) 3549 tmp_vcf_name = tmp_vcf.name 3550 3551 # VCF header 3552 vcf_reader = self.get_header() 3553 log.debug("Initial header: " + str(vcf_reader.infos)) 3554 3555 # Existing annotations 3556 for vcf_annotation in self.get_header().infos: 3557 3558 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3559 log.debug( 3560 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3561 ) 3562 3563 if annotations: 3564 3565 tmp_ann_vcf_list = [] 3566 commands = [] 3567 tmp_files = [] 3568 err_files = [] 3569 3570 for annotation in annotations: 3571 annotation_fields = annotations[annotation] 3572 3573 # Annotation Name 3574 annotation_name = os.path.basename(annotation) 3575 3576 if not annotation_fields: 3577 annotation_fields = {"INFO": None} 3578 3579 log.debug(f"Annotation '{annotation_name}'") 3580 log.debug( 3581 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3582 ) 3583 3584 # Create Database 3585 database = Database( 3586 database=annotation, 3587 databases_folders=databases_folders, 3588 assembly=assembly, 3589 ) 3590 3591 # Find files 3592 db_file = database.get_database() 3593 db_file = full_path(db_file) 3594 db_hdr_file = database.get_header_file() 3595 db_hdr_file = full_path(db_hdr_file) 3596 db_file_type = database.get_format() 3597 db_tbi_file = f"{db_file}.tbi" 3598 db_file_compressed = database.is_compressed() 3599 3600 # Check if compressed 3601 if not db_file_compressed: 3602 log.error( 3603 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3604 ) 
3605 raise ValueError( 3606 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3607 ) 3608 3609 # Check if indexed 3610 if not os.path.exists(db_tbi_file): 3611 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 3612 raise ValueError( 3613 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3614 ) 3615 3616 # Check index - try to create if not exists 3617 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3618 log.error("Annotation failed: database not valid") 3619 log.error(f"Annotation annotation file: {db_file}") 3620 log.error(f"Annotation annotation header: {db_hdr_file}") 3621 log.error(f"Annotation annotation index: {db_tbi_file}") 3622 raise ValueError( 3623 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3624 ) 3625 else: 3626 3627 log.debug( 3628 f"Annotation '{annotation}' - file: " 3629 + str(db_file) 3630 + " and " 3631 + str(db_hdr_file) 3632 ) 3633 3634 # Load header as VCF object 3635 db_hdr_vcf = Variants(input=db_hdr_file) 3636 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3637 log.debug( 3638 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3639 ) 3640 3641 # For all fields in database 3642 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3643 annotation_fields = { 3644 key: key for key in db_hdr_vcf_header_infos 3645 } 3646 log.debug( 3647 "Annotation database header - All annotations added: " 3648 + str(annotation_fields) 3649 ) 3650 3651 # Number of fields 3652 nb_annotation_field = 0 3653 annotation_list = [] 3654 3655 for annotation_field in annotation_fields: 3656 3657 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3658 annotation_fields_new_name = annotation_fields.get( 3659 annotation_field, annotation_field 3660 ) 3661 if not annotation_fields_new_name: 3662 annotation_fields_new_name = annotation_field 3663 3664 # Check if field is in DB and if field is not elready in input data 3665 if ( 3666 annotation_field in db_hdr_vcf.get_header().infos 3667 and annotation_fields_new_name 3668 not in self.get_header().infos 3669 ): 3670 3671 log.info( 3672 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3673 ) 3674 3675 # Add INFO field to header 3676 db_hdr_vcf_header_infos_number = ( 3677 db_hdr_vcf_header_infos[annotation_field].num or "." 3678 ) 3679 db_hdr_vcf_header_infos_type = ( 3680 db_hdr_vcf_header_infos[annotation_field].type 3681 or "String" 3682 ) 3683 db_hdr_vcf_header_infos_description = ( 3684 db_hdr_vcf_header_infos[annotation_field].desc 3685 or f"{annotation_field} description" 3686 ) 3687 db_hdr_vcf_header_infos_source = ( 3688 db_hdr_vcf_header_infos[annotation_field].source 3689 or "unknown" 3690 ) 3691 db_hdr_vcf_header_infos_version = ( 3692 db_hdr_vcf_header_infos[annotation_field].version 3693 or "unknown" 3694 ) 3695 3696 vcf_reader.infos[annotation_fields_new_name] = ( 3697 vcf.parser._Info( 3698 annotation_fields_new_name, 3699 db_hdr_vcf_header_infos_number, 3700 db_hdr_vcf_header_infos_type, 3701 db_hdr_vcf_header_infos_description, 3702 db_hdr_vcf_header_infos_source, 3703 db_hdr_vcf_header_infos_version, 3704 self.code_type_map[db_hdr_vcf_header_infos_type], 3705 ) 3706 ) 3707 3708 # annotation_list.append(annotation_field) 3709 if annotation_field != annotation_fields_new_name: 3710 annotation_list.append( 3711 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3712 ) 3713 else: 3714 annotation_list.append(annotation_field) 3715 3716 nb_annotation_field += 1 3717 3718 else: 3719 3720 if annotation_field not in db_hdr_vcf.get_header().infos: 3721 log.warning( 3722 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3723 ) 3724 if annotation_fields_new_name in self.get_header().infos: 3725 log.warning( 3726 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3727 ) 3728 3729 log.info( 3730 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3731 ) 3732 3733 annotation_infos = ",".join(annotation_list) 3734 3735 if annotation_infos != "": 3736 3737 # Protect header for bcftools (remove "#CHROM" and variants line) 3738 log.debug("Protect Header file - remove #CHROM line if exists") 3739 tmp_header_vcf = NamedTemporaryFile( 3740 prefix=self.get_prefix(), 3741 dir=self.get_tmp_dir(), 3742 suffix=".hdr", 3743 delete=False, 3744 ) 3745 tmp_header_vcf_name = tmp_header_vcf.name 3746 tmp_files.append(tmp_header_vcf_name) 3747 # Command 3748 if db_hdr_file.endswith(".gz"): 3749 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3750 else: 3751 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3752 # Run 3753 run_parallel_commands([command_extract_header], 1) 3754 3755 # Find chomosomes 3756 log.debug("Find chromosomes ") 3757 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3758 sql_query_chromosomes_df = self.get_query_to_df( 3759 sql_query_chromosomes 3760 ) 3761 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3762 3763 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3764 3765 # BED columns in the annotation file 3766 if db_file_type in ["bed"]: 3767 annotation_infos = "CHROM,POS,POS," + annotation_infos 3768 3769 for chrom in chomosomes_list: 3770 3771 # Create BED on initial VCF 3772 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3773 tmp_bed = NamedTemporaryFile( 3774 prefix=self.get_prefix(), 3775 
dir=self.get_tmp_dir(), 3776 suffix=".bed", 3777 delete=False, 3778 ) 3779 tmp_bed_name = tmp_bed.name 3780 tmp_files.append(tmp_bed_name) 3781 3782 # Detecte regions 3783 log.debug( 3784 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3785 ) 3786 window = 1000000 3787 sql_query_intervals_for_bed = f""" 3788 SELECT \"#CHROM\", 3789 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3790 \"POS\"+{window} 3791 FROM {table_variants} as table_variants 3792 WHERE table_variants.\"#CHROM\" = '{chrom}' 3793 """ 3794 regions = self.conn.execute( 3795 sql_query_intervals_for_bed 3796 ).fetchall() 3797 merged_regions = merge_regions(regions) 3798 log.debug( 3799 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3800 ) 3801 3802 header = ["#CHROM", "START", "END"] 3803 with open(tmp_bed_name, "w") as f: 3804 # Write the header with tab delimiter 3805 f.write("\t".join(header) + "\n") 3806 for d in merged_regions: 3807 # Write each data row with tab delimiter 3808 f.write("\t".join(map(str, d)) + "\n") 3809 3810 # Tmp files 3811 tmp_annotation_vcf = NamedTemporaryFile( 3812 prefix=self.get_prefix(), 3813 dir=self.get_tmp_dir(), 3814 suffix=".vcf.gz", 3815 delete=False, 3816 ) 3817 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3818 tmp_files.append(tmp_annotation_vcf_name) 3819 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3820 tmp_annotation_vcf_name_err = ( 3821 tmp_annotation_vcf_name + ".err" 3822 ) 3823 err_files.append(tmp_annotation_vcf_name_err) 3824 3825 # Annotate Command 3826 log.debug( 3827 f"Annotation '{annotation}' - add bcftools command" 3828 ) 3829 3830 # Command 3831 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3832 3833 # Add command 3834 commands.append(command_annotate) 3835 3836 # if some commands 3837 if commands: 3838 3839 # Export VCF file 3840 self.export_variant_vcf( 3841 vcf_file=tmp_vcf_name, 3842 remove_info=True, 3843 add_samples=False, 3844 index=True, 3845 ) 3846 3847 # Threads 3848 # calculate threads for annotated commands 3849 if commands: 3850 threads_bcftools_annotate = round(threads / len(commands)) 3851 else: 3852 threads_bcftools_annotate = 1 3853 3854 if not threads_bcftools_annotate: 3855 threads_bcftools_annotate = 1 3856 3857 # Add threads option to bcftools commands 3858 if threads_bcftools_annotate > 1: 3859 commands_threaded = [] 3860 for command in commands: 3861 commands_threaded.append( 3862 command.replace( 3863 f"{bcftools_bin_command} annotate ", 3864 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3865 ) 3866 ) 3867 commands = commands_threaded 3868 3869 # Command annotation multithreading 3870 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3871 log.info( 3872 f"Annotation - Annotation multithreaded in " 3873 + str(len(commands)) 3874 + " commands" 3875 ) 3876 3877 run_parallel_commands(commands, threads) 3878 3879 # Merge 3880 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3881 3882 if tmp_ann_vcf_list_cmd: 3883 3884 # Tmp file 3885 tmp_annotate_vcf = NamedTemporaryFile( 3886 prefix=self.get_prefix(), 3887 dir=self.get_tmp_dir(), 3888 suffix=".vcf.gz", 3889 delete=True, 3890 ) 3891 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3892 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3893 err_files.append(tmp_annotate_vcf_name_err) 3894 3895 # Tmp file remove command 3896 tmp_files_remove_command = "" 3897 if tmp_files: 3898 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3899 3900 # Command merge 3901 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3902 log.info( 3903 f"Annotation - Annotation merging " 3904 + str(len(commands)) 3905 + " annotated files" 3906 ) 3907 log.debug(f"Annotation - merge command: {merge_command}") 3908 run_parallel_commands([merge_command], 1) 3909 3910 # Error messages 3911 log.info(f"Error/Warning messages:") 3912 error_message_command_all = [] 3913 error_message_command_warning = [] 3914 error_message_command_err = [] 3915 for err_file in err_files: 3916 with open(err_file, "r") as f: 3917 for line in f: 3918 message = line.strip() 3919 error_message_command_all.append(message) 3920 if line.startswith("[W::"): 3921 error_message_command_warning.append(message) 3922 if line.startswith("[E::"): 3923 error_message_command_err.append( 3924 f"{err_file}: " + message 3925 ) 3926 # log info 3927 for message in list( 3928 set(error_message_command_err + error_message_command_warning) 3929 ): 3930 log.info(f" {message}") 3931 # debug info 3932 for message in list(set(error_message_command_all)): 3933 log.debug(f" {message}") 3934 # failed 3935 if len(error_message_command_err): 3936 log.error("Annotation failed: Error in commands") 3937 raise ValueError("Annotation failed: Error in commands") 3938 3939 # Update variants 3940 log.info(f"Annotation - Updating...") 3941 self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools.
Parameters
- threads: Number of threads to use
Returns
None
3943 def annotation_exomiser(self, threads: int = None) -> None: 3944 """ 3945 This function annotate with Exomiser 3946 3947 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 3948 - "analysis" (dict/file): 3949 Full analysis dictionnary parameters (see Exomiser docs). 3950 Either a dict, or a file in JSON or YAML format. 3951 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 3952 Default : None 3953 - "preset" (string): 3954 Analysis preset (available in config folder). 3955 Used if no full "analysis" is provided. 3956 Default: "exome" 3957 - "phenopacket" (dict/file): 3958 Samples and phenotipic features parameters (see Exomiser docs). 3959 Either a dict, or a file in JSON or YAML format. 3960 Default: None 3961 - "subject" (dict): 3962 Sample parameters (see Exomiser docs). 3963 Example: 3964 "subject": 3965 { 3966 "id": "ISDBM322017", 3967 "sex": "FEMALE" 3968 } 3969 Default: None 3970 - "sample" (string): 3971 Sample name to construct "subject" section: 3972 "subject": 3973 { 3974 "id": "<sample>", 3975 "sex": "UNKNOWN_SEX" 3976 } 3977 Default: None 3978 - "phenotypicFeatures" (dict) 3979 Phenotypic features to construct "subject" section. 3980 Example: 3981 "phenotypicFeatures": 3982 [ 3983 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3984 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3985 ] 3986 - "hpo" (list) 3987 List of HPO ids as phenotypic features. 3988 Example: 3989 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3990 Default: [] 3991 - "outputOptions" (dict): 3992 Output options (see Exomiser docs). 
3993 Default: 3994 "output_options" = 3995 { 3996 "outputContributingVariantsOnly": False, 3997 "numGenes": 0, 3998 "outputFormats": ["TSV_VARIANT", "VCF"] 3999 } 4000 - "transcript_source" (string): 4001 Transcript source (either "refseq", "ucsc", "ensembl") 4002 Default: "refseq" 4003 - "exomiser_to_info" (boolean): 4004 Add exomiser TSV file columns as INFO fields in VCF. 4005 Default: False 4006 - "release" (string): 4007 Exomise database release. 4008 If not exists, database release will be downloaded (take a while). 4009 Default: None (provided by application.properties configuration file) 4010 - "exomiser_application_properties" (file): 4011 Exomiser configuration file (see Exomiser docs). 4012 Useful to automatically download databases (especially for specific genome databases). 4013 4014 Notes: 4015 - If no sample in parameters, first sample in VCF will be chosen 4016 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4017 4018 :param threads: The number of threads to use 4019 :return: None. 
4020 """ 4021 4022 # DEBUG 4023 log.debug("Start annotation with Exomiser databases") 4024 4025 # Threads 4026 if not threads: 4027 threads = self.get_threads() 4028 log.debug("Threads: " + str(threads)) 4029 4030 # Config 4031 config = self.get_config() 4032 log.debug("Config: " + str(config)) 4033 4034 # Config - Folders - Databases 4035 databases_folders = ( 4036 config.get("folders", {}) 4037 .get("databases", {}) 4038 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4039 ) 4040 databases_folders = full_path(databases_folders) 4041 if not os.path.exists(databases_folders): 4042 log.error(f"Databases annotations: {databases_folders} NOT found") 4043 log.debug("Databases annotations: " + str(databases_folders)) 4044 4045 # Config - Exomiser 4046 exomiser_bin_command = get_bin_command( 4047 bin="exomiser-cli*.jar", 4048 tool="exomiser", 4049 bin_type="jar", 4050 config=config, 4051 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4052 ) 4053 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4054 if not exomiser_bin_command: 4055 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4056 log.error(msg_err) 4057 raise ValueError(msg_err) 4058 4059 # Param 4060 param = self.get_param() 4061 log.debug("Param: " + str(param)) 4062 4063 # Param - Exomiser 4064 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4065 log.debug(f"Param Exomiser: {param_exomiser}") 4066 4067 # Param - Assembly 4068 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4069 log.debug("Assembly: " + str(assembly)) 4070 4071 # Data 4072 table_variants = self.get_table_variants() 4073 4074 # Check if not empty 4075 log.debug("Check if not empty") 4076 sql_query_chromosomes = ( 4077 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4078 ) 4079 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4080 log.info(f"VCF empty") 4081 return False 4082 4083 # VCF header 4084 
vcf_reader = self.get_header() 4085 log.debug("Initial header: " + str(vcf_reader.infos)) 4086 4087 # Samples 4088 samples = self.get_header_sample_list() 4089 if not samples: 4090 log.error("No Samples in VCF") 4091 return False 4092 log.debug(f"Samples: {samples}") 4093 4094 # Memory limit 4095 memory_limit = self.get_memory("8G") 4096 log.debug(f"memory_limit: {memory_limit}") 4097 4098 # Exomiser java options 4099 exomiser_java_options = ( 4100 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4101 ) 4102 log.debug(f"Exomiser java options: {exomiser_java_options}") 4103 4104 # Download Exomiser (if not exists) 4105 exomiser_release = param_exomiser.get("release", None) 4106 exomiser_application_properties = param_exomiser.get( 4107 "exomiser_application_properties", None 4108 ) 4109 databases_download_exomiser( 4110 assemblies=[assembly], 4111 exomiser_folder=databases_folders, 4112 exomiser_release=exomiser_release, 4113 exomiser_phenotype_release=exomiser_release, 4114 exomiser_application_properties=exomiser_application_properties, 4115 ) 4116 4117 # Force annotation 4118 force_update_annotation = True 4119 4120 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4121 log.debug("Start annotation Exomiser") 4122 4123 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4124 4125 # tmp_dir = "/tmp/exomiser" 4126 4127 ### ANALYSIS ### 4128 ################ 4129 4130 # Create analysis.json through analysis dict 4131 # either analysis in param or by default 4132 # depending on preset exome/genome) 4133 4134 # Init analysis dict 4135 param_exomiser_analysis_dict = {} 4136 4137 # analysis from param 4138 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4139 param_exomiser_analysis = full_path(param_exomiser_analysis) 4140 4141 # If analysis in param -> load anlaysis json 4142 if param_exomiser_analysis: 4143 4144 # If param analysis is a file and exists 4145 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4146 param_exomiser_analysis 4147 ): 4148 # Load analysis file into analysis dict (either yaml or json) 4149 with open(param_exomiser_analysis) as json_file: 4150 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4151 4152 # If param analysis is a dict 4153 elif isinstance(param_exomiser_analysis, dict): 4154 # Load analysis dict into analysis dict (either yaml or json) 4155 param_exomiser_analysis_dict = param_exomiser_analysis 4156 4157 # Error analysis type 4158 else: 4159 log.error(f"Analysis type unknown. Check param file.") 4160 raise ValueError(f"Analysis type unknown. Check param file.") 4161 4162 # Case no input analysis config file/dict 4163 # Use preset (exome/genome) to open default config file 4164 if not param_exomiser_analysis_dict: 4165 4166 # default preset 4167 default_preset = "exome" 4168 4169 # Get param preset or default preset 4170 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4171 4172 # Try to find if preset is a file 4173 if os.path.exists(param_exomiser_preset): 4174 # Preset file is provided in full path 4175 param_exomiser_analysis_default_config_file = ( 4176 param_exomiser_preset 4177 ) 4178 # elif os.path.exists(full_path(param_exomiser_preset)): 4179 # # Preset file is provided in full path 4180 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4181 elif os.path.exists( 4182 os.path.join(folder_config, param_exomiser_preset) 4183 ): 4184 # Preset file is provided a basename in config folder (can be a path with subfolders) 4185 param_exomiser_analysis_default_config_file = os.path.join( 4186 folder_config, param_exomiser_preset 4187 ) 4188 else: 4189 # Construct preset file 4190 param_exomiser_analysis_default_config_file = os.path.join( 4191 folder_config, 4192 f"preset-{param_exomiser_preset}-analysis.json", 4193 ) 4194 4195 # If preset file exists 4196 param_exomiser_analysis_default_config_file = full_path( 4197 
param_exomiser_analysis_default_config_file 4198 ) 4199 if os.path.exists(param_exomiser_analysis_default_config_file): 4200 # Load prest file into analysis dict (either yaml or json) 4201 with open( 4202 param_exomiser_analysis_default_config_file 4203 ) as json_file: 4204 # param_exomiser_analysis_dict[""] = json.load(json_file) 4205 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4206 json_file 4207 ) 4208 4209 # Error preset file 4210 else: 4211 log.error( 4212 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4213 ) 4214 raise ValueError( 4215 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4216 ) 4217 4218 # If no analysis dict created 4219 if not param_exomiser_analysis_dict: 4220 log.error(f"No analysis config") 4221 raise ValueError(f"No analysis config") 4222 4223 # Log 4224 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4225 4226 ### PHENOPACKET ### 4227 ################### 4228 4229 # If no PhenoPacket in analysis dict -> check in param 4230 if "phenopacket" not in param_exomiser_analysis_dict: 4231 4232 # If PhenoPacket in param -> load anlaysis json 4233 if param_exomiser.get("phenopacket", None): 4234 4235 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4236 param_exomiser_phenopacket = full_path( 4237 param_exomiser_phenopacket 4238 ) 4239 4240 # If param phenopacket is a file and exists 4241 if isinstance( 4242 param_exomiser_phenopacket, str 4243 ) and os.path.exists(param_exomiser_phenopacket): 4244 # Load phenopacket file into analysis dict (either yaml or json) 4245 with open(param_exomiser_phenopacket) as json_file: 4246 param_exomiser_analysis_dict["phenopacket"] = ( 4247 yaml.safe_load(json_file) 4248 ) 4249 4250 # If param phenopacket is a dict 4251 elif isinstance(param_exomiser_phenopacket, dict): 4252 # Load phenopacket dict into analysis dict (either yaml or json) 4253 param_exomiser_analysis_dict["phenopacket"] = ( 4254 
param_exomiser_phenopacket 4255 ) 4256 4257 # Error phenopacket type 4258 else: 4259 log.error(f"Phenopacket type unknown. Check param file.") 4260 raise ValueError( 4261 f"Phenopacket type unknown. Check param file." 4262 ) 4263 4264 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4265 if "phenopacket" not in param_exomiser_analysis_dict: 4266 4267 # Init PhenoPacket 4268 param_exomiser_analysis_dict["phenopacket"] = { 4269 "id": "analysis", 4270 "proband": {}, 4271 } 4272 4273 ### Add subject ### 4274 4275 # If subject exists 4276 param_exomiser_subject = param_exomiser.get("subject", {}) 4277 4278 # If subject not exists -> found sample ID 4279 if not param_exomiser_subject: 4280 4281 # Found sample ID in param 4282 sample = param_exomiser.get("sample", None) 4283 4284 # Find sample ID (first sample) 4285 if not sample: 4286 sample_list = self.get_header_sample_list() 4287 if len(sample_list) > 0: 4288 sample = sample_list[0] 4289 else: 4290 log.error(f"No sample found") 4291 raise ValueError(f"No sample found") 4292 4293 # Create subject 4294 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4295 4296 # Add to dict 4297 param_exomiser_analysis_dict["phenopacket"][ 4298 "subject" 4299 ] = param_exomiser_subject 4300 4301 ### Add "phenotypicFeatures" ### 4302 4303 # If phenotypicFeatures exists 4304 param_exomiser_phenotypicfeatures = param_exomiser.get( 4305 "phenotypicFeatures", [] 4306 ) 4307 4308 # If phenotypicFeatures not exists -> Try to infer from hpo list 4309 if not param_exomiser_phenotypicfeatures: 4310 4311 # Found HPO in param 4312 param_exomiser_hpo = param_exomiser.get("hpo", []) 4313 4314 # Split HPO if list in string format separated by comma 4315 if isinstance(param_exomiser_hpo, str): 4316 param_exomiser_hpo = param_exomiser_hpo.split(",") 4317 4318 # Create HPO list 4319 for hpo in param_exomiser_hpo: 4320 hpo_clean = re.sub("[^0-9]", "", hpo) 4321 param_exomiser_phenotypicfeatures.append( 4322 { 
4323 "type": { 4324 "id": f"HP:{hpo_clean}", 4325 "label": f"HP:{hpo_clean}", 4326 } 4327 } 4328 ) 4329 4330 # Add to dict 4331 param_exomiser_analysis_dict["phenopacket"][ 4332 "phenotypicFeatures" 4333 ] = param_exomiser_phenotypicfeatures 4334 4335 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4336 if not param_exomiser_phenotypicfeatures: 4337 for step in param_exomiser_analysis_dict.get( 4338 "analysis", {} 4339 ).get("steps", []): 4340 if "hiPhivePrioritiser" in step: 4341 param_exomiser_analysis_dict.get("analysis", {}).get( 4342 "steps", [] 4343 ).remove(step) 4344 4345 ### Add Input File ### 4346 4347 # Initial file name and htsFiles 4348 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4349 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4350 { 4351 "uri": tmp_vcf_name, 4352 "htsFormat": "VCF", 4353 "genomeAssembly": assembly, 4354 } 4355 ] 4356 4357 ### Add metaData ### 4358 4359 # If metaData not in analysis dict 4360 if "metaData" not in param_exomiser_analysis_dict: 4361 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4362 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4363 "createdBy": "howard", 4364 "phenopacketSchemaVersion": 1, 4365 } 4366 4367 ### OutputOptions ### 4368 4369 # Init output result folder 4370 output_results = os.path.join(tmp_dir, "results") 4371 4372 # If no outputOptions in analysis dict 4373 if "outputOptions" not in param_exomiser_analysis_dict: 4374 4375 # default output formats 4376 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4377 4378 # Get outputOptions in param 4379 output_options = param_exomiser.get("outputOptions", None) 4380 4381 # If no output_options in param -> check 4382 if not output_options: 4383 output_options = { 4384 "outputContributingVariantsOnly": False, 4385 "numGenes": 0, 4386 "outputFormats": defaut_output_formats, 4387 } 4388 4389 # Replace outputDirectory in output options 4390 output_options["outputDirectory"] = output_results 
4391 output_options["outputFileName"] = "howard" 4392 4393 # Add outputOptions in analysis dict 4394 param_exomiser_analysis_dict["outputOptions"] = output_options 4395 4396 else: 4397 4398 # Replace output_results and output format (if exists in param) 4399 param_exomiser_analysis_dict["outputOptions"][ 4400 "outputDirectory" 4401 ] = output_results 4402 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4403 list( 4404 set( 4405 param_exomiser_analysis_dict.get( 4406 "outputOptions", {} 4407 ).get("outputFormats", []) 4408 + ["TSV_VARIANT", "VCF"] 4409 ) 4410 ) 4411 ) 4412 4413 # log 4414 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4415 4416 ### ANALYSIS FILE ### 4417 ##################### 4418 4419 ### Full JSON analysis config file ### 4420 4421 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4422 with open(exomiser_analysis, "w") as fp: 4423 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4424 4425 ### SPLIT analysis and sample config files 4426 4427 # Splitted analysis dict 4428 param_exomiser_analysis_dict_for_split = ( 4429 param_exomiser_analysis_dict.copy() 4430 ) 4431 4432 # Phenopacket JSON file 4433 exomiser_analysis_phenopacket = os.path.join( 4434 tmp_dir, "analysis_phenopacket.json" 4435 ) 4436 with open(exomiser_analysis_phenopacket, "w") as fp: 4437 json.dump( 4438 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4439 fp, 4440 indent=4, 4441 ) 4442 4443 # Analysis JSON file without Phenopacket parameters 4444 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4445 exomiser_analysis_analysis = os.path.join( 4446 tmp_dir, "analysis_analysis.json" 4447 ) 4448 with open(exomiser_analysis_analysis, "w") as fp: 4449 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4450 4451 ### INITAL VCF file ### 4452 ####################### 4453 4454 ### Create list of samples to use and include inti initial VCF file #### 4455 4456 # Subject (main sample) 4457 # Get sample ID in 
analysis dict 4458 sample_subject = ( 4459 param_exomiser_analysis_dict.get("phenopacket", {}) 4460 .get("subject", {}) 4461 .get("id", None) 4462 ) 4463 sample_proband = ( 4464 param_exomiser_analysis_dict.get("phenopacket", {}) 4465 .get("proband", {}) 4466 .get("subject", {}) 4467 .get("id", None) 4468 ) 4469 sample = [] 4470 if sample_subject: 4471 sample.append(sample_subject) 4472 if sample_proband: 4473 sample.append(sample_proband) 4474 4475 # Get sample ID within Pedigree 4476 pedigree_persons_list = ( 4477 param_exomiser_analysis_dict.get("phenopacket", {}) 4478 .get("pedigree", {}) 4479 .get("persons", {}) 4480 ) 4481 4482 # Create list with all sample ID in pedigree (if exists) 4483 pedigree_persons = [] 4484 for person in pedigree_persons_list: 4485 pedigree_persons.append(person.get("individualId")) 4486 4487 # Concat subject sample ID and samples ID in pedigreesamples 4488 samples = list(set(sample + pedigree_persons)) 4489 4490 # Check if sample list is not empty 4491 if not samples: 4492 log.error(f"No samples found") 4493 raise ValueError(f"No samples found") 4494 4495 # Create VCF with sample (either sample in param or first one by default) 4496 # Export VCF file 4497 self.export_variant_vcf( 4498 vcf_file=tmp_vcf_name, 4499 remove_info=True, 4500 add_samples=True, 4501 list_samples=samples, 4502 index=False, 4503 ) 4504 4505 ### Execute Exomiser ### 4506 ######################## 4507 4508 # Init command 4509 exomiser_command = "" 4510 4511 # Command exomiser options 4512 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4513 4514 # Release 4515 exomiser_release = param_exomiser.get("release", None) 4516 if exomiser_release: 4517 # phenotype data version 4518 exomiser_options += ( 4519 f" --exomiser.phenotype.data-version={exomiser_release} " 4520 ) 4521 # data version 4522 exomiser_options += ( 4523 f" 
--exomiser.{assembly}.data-version={exomiser_release} " 4524 ) 4525 # variant white list 4526 variant_white_list_file = ( 4527 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4528 ) 4529 if os.path.exists( 4530 os.path.join( 4531 databases_folders, assembly, variant_white_list_file 4532 ) 4533 ): 4534 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4535 4536 # transcript_source 4537 transcript_source = param_exomiser.get( 4538 "transcript_source", None 4539 ) # ucsc, refseq, ensembl 4540 if transcript_source: 4541 exomiser_options += ( 4542 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4543 ) 4544 4545 # If analysis contain proband param 4546 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4547 "proband", {} 4548 ): 4549 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4550 4551 # If no proband (usually uniq sample) 4552 else: 4553 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4554 4555 # Log 4556 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4557 4558 # Run command 4559 result = subprocess.call( 4560 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4561 ) 4562 if result: 4563 log.error("Exomiser command failed") 4564 raise ValueError("Exomiser command failed") 4565 4566 ### RESULTS ### 4567 ############### 4568 4569 ### Annotate with TSV fields ### 4570 4571 # Init result tsv file 4572 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4573 4574 # Init result tsv file 4575 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 4576 4577 # Parse TSV file and explode columns in INFO field 4578 if exomiser_to_info and os.path.exists(output_results_tsv): 4579 4580 # Log 4581 log.debug("Exomiser columns to VCF INFO field") 4582 4583 # Retrieve columns and 
types 4584 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4585 output_results_tsv_df = self.get_query_to_df(query) 4586 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4587 4588 # Init concat fields for update 4589 sql_query_update_concat_fields = [] 4590 4591 # Fields to avoid 4592 fields_to_avoid = [ 4593 "CONTIG", 4594 "START", 4595 "END", 4596 "REF", 4597 "ALT", 4598 "QUAL", 4599 "FILTER", 4600 "GENOTYPE", 4601 ] 4602 4603 # List all columns to add into header 4604 for header_column in output_results_tsv_columns: 4605 4606 # If header column is enable 4607 if header_column not in fields_to_avoid: 4608 4609 # Header info type 4610 header_info_type = "String" 4611 header_column_df = output_results_tsv_df[header_column] 4612 header_column_df_dtype = header_column_df.dtype 4613 if header_column_df_dtype == object: 4614 if ( 4615 pd.to_numeric(header_column_df, errors="coerce") 4616 .notnull() 4617 .all() 4618 ): 4619 header_info_type = "Float" 4620 else: 4621 header_info_type = "Integer" 4622 4623 # Header info 4624 characters_to_validate = ["-"] 4625 pattern = "[" + "".join(characters_to_validate) + "]" 4626 header_info_name = re.sub( 4627 pattern, 4628 "_", 4629 f"Exomiser_{header_column}".replace("#", ""), 4630 ) 4631 header_info_number = "." 
4632 header_info_description = ( 4633 f"Exomiser {header_column} annotation" 4634 ) 4635 header_info_source = "Exomiser" 4636 header_info_version = "unknown" 4637 header_info_code = CODE_TYPE_MAP[header_info_type] 4638 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4639 header_info_name, 4640 header_info_number, 4641 header_info_type, 4642 header_info_description, 4643 header_info_source, 4644 header_info_version, 4645 header_info_code, 4646 ) 4647 4648 # Add field to add for update to concat fields 4649 sql_query_update_concat_fields.append( 4650 f""" 4651 CASE 4652 WHEN table_parquet."{header_column}" NOT IN ('','.') 4653 THEN concat( 4654 '{header_info_name}=', 4655 table_parquet."{header_column}", 4656 ';' 4657 ) 4658 4659 ELSE '' 4660 END 4661 """ 4662 ) 4663 4664 # Update query 4665 sql_query_update = f""" 4666 UPDATE {table_variants} as table_variants 4667 SET INFO = concat( 4668 CASE 4669 WHEN INFO NOT IN ('', '.') 4670 THEN INFO 4671 ELSE '' 4672 END, 4673 CASE 4674 WHEN table_variants.INFO NOT IN ('','.') 4675 THEN ';' 4676 ELSE '' 4677 END, 4678 ( 4679 SELECT 4680 concat( 4681 {",".join(sql_query_update_concat_fields)} 4682 ) 4683 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4684 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4685 AND table_parquet.\"START\" = table_variants.\"POS\" 4686 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4687 AND table_parquet.\"REF\" = table_variants.\"REF\" 4688 ) 4689 ) 4690 ; 4691 """ 4692 4693 # Update 4694 self.conn.execute(sql_query_update) 4695 4696 ### Annotate with VCF INFO field ### 4697 4698 # Init result VCF file 4699 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4700 4701 # If VCF exists 4702 if os.path.exists(output_results_vcf): 4703 4704 # Log 4705 log.debug("Exomiser result VCF update variants") 4706 4707 # Find Exomiser INFO field annotation in header 4708 with 
gzip.open(output_results_vcf, "rt") as f: 4709 header_list = self.read_vcf_header(f) 4710 exomiser_vcf_header = vcf.Reader( 4711 io.StringIO("\n".join(header_list)) 4712 ) 4713 4714 # Add annotation INFO field to header 4715 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4716 4717 # Update variants with VCF 4718 self.update_from_vcf(output_results_vcf) 4719 4720 return True
This function annotates with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If not exists, database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
4722 def annotation_snpeff(self, threads: int = None) -> None: 4723 """ 4724 This function annotate with snpEff 4725 4726 :param threads: The number of threads to use 4727 :return: the value of the variable "return_value". 4728 """ 4729 4730 # DEBUG 4731 log.debug("Start annotation with snpeff databases") 4732 4733 # Threads 4734 if not threads: 4735 threads = self.get_threads() 4736 log.debug("Threads: " + str(threads)) 4737 4738 # DEBUG 4739 delete_tmp = True 4740 if self.get_config().get("verbosity", "warning") in ["debug"]: 4741 delete_tmp = False 4742 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4743 4744 # Config 4745 config = self.get_config() 4746 log.debug("Config: " + str(config)) 4747 4748 # Config - Folders - Databases 4749 databases_folders = ( 4750 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4751 ) 4752 log.debug("Databases annotations: " + str(databases_folders)) 4753 4754 # # Config - Java 4755 # java_bin = get_bin( 4756 # tool="java", 4757 # bin="java", 4758 # bin_type="bin", 4759 # config=config, 4760 # default_folder="/usr/bin", 4761 # ) 4762 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4763 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4764 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4765 4766 # # Config - snpEff bin 4767 # snpeff_jar = get_bin( 4768 # tool="snpeff", 4769 # bin="snpEff.jar", 4770 # bin_type="jar", 4771 # config=config, 4772 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4773 # ) 4774 # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))): 4775 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4776 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4777 4778 # Config - snpEff bin command 4779 snpeff_bin_command = get_bin_command( 4780 bin="snpEff.jar", 4781 tool="snpeff", 4782 bin_type="jar", 4783 config=config, 4784 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4785 ) 
4786 if not snpeff_bin_command: 4787 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4788 log.error(msg_err) 4789 raise ValueError(msg_err) 4790 4791 # Config - snpEff databases 4792 snpeff_databases = ( 4793 config.get("folders", {}) 4794 .get("databases", {}) 4795 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4796 ) 4797 snpeff_databases = full_path(snpeff_databases) 4798 if snpeff_databases is not None and snpeff_databases != "": 4799 log.debug(f"Create snpEff databases folder") 4800 if not os.path.exists(snpeff_databases): 4801 os.makedirs(snpeff_databases) 4802 4803 # Param 4804 param = self.get_param() 4805 log.debug("Param: " + str(param)) 4806 4807 # Param 4808 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4809 log.debug("Options: " + str(options)) 4810 4811 # Param - Assembly 4812 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4813 4814 # Param - Options 4815 snpeff_options = ( 4816 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4817 ) 4818 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4819 snpeff_csvstats = ( 4820 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4821 ) 4822 if snpeff_stats: 4823 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4824 snpeff_stats = full_path(snpeff_stats) 4825 snpeff_options += f" -stats {snpeff_stats}" 4826 if snpeff_csvstats: 4827 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4828 snpeff_csvstats = full_path(snpeff_csvstats) 4829 snpeff_options += f" -csvStats {snpeff_csvstats}" 4830 4831 # Data 4832 table_variants = self.get_table_variants() 4833 4834 # Check if not empty 4835 log.debug("Check if not empty") 4836 sql_query_chromosomes = ( 4837 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4838 ) 4839 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4840 if not 
self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4841 log.info(f"VCF empty") 4842 return 4843 4844 # Export in VCF 4845 log.debug("Create initial file to annotate") 4846 tmp_vcf = NamedTemporaryFile( 4847 prefix=self.get_prefix(), 4848 dir=self.get_tmp_dir(), 4849 suffix=".vcf.gz", 4850 delete=True, 4851 ) 4852 tmp_vcf_name = tmp_vcf.name 4853 4854 # VCF header 4855 vcf_reader = self.get_header() 4856 log.debug("Initial header: " + str(vcf_reader.infos)) 4857 4858 # Existing annotations 4859 for vcf_annotation in self.get_header().infos: 4860 4861 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4862 log.debug( 4863 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4864 ) 4865 4866 # Memory limit 4867 # if config.get("memory", None): 4868 # memory_limit = config.get("memory", "8G") 4869 # else: 4870 # memory_limit = "8G" 4871 memory_limit = self.get_memory("8G") 4872 log.debug(f"memory_limit: {memory_limit}") 4873 4874 # snpEff java options 4875 snpeff_java_options = ( 4876 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4877 ) 4878 log.debug(f"Exomiser java options: {snpeff_java_options}") 4879 4880 force_update_annotation = True 4881 4882 if "ANN" not in self.get_header().infos or force_update_annotation: 4883 4884 # Check snpEff database 4885 log.debug(f"Check snpEff databases {[assembly]}") 4886 databases_download_snpeff( 4887 folder=snpeff_databases, assemblies=[assembly], config=config 4888 ) 4889 4890 # Export VCF file 4891 self.export_variant_vcf( 4892 vcf_file=tmp_vcf_name, 4893 remove_info=True, 4894 add_samples=False, 4895 index=True, 4896 ) 4897 4898 # Tmp file 4899 err_files = [] 4900 tmp_annotate_vcf = NamedTemporaryFile( 4901 prefix=self.get_prefix(), 4902 dir=self.get_tmp_dir(), 4903 suffix=".vcf", 4904 delete=False, 4905 ) 4906 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4907 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4908 
err_files.append(tmp_annotate_vcf_name_err) 4909 4910 # Command 4911 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4912 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4913 run_parallel_commands([snpeff_command], 1) 4914 4915 # Error messages 4916 log.info(f"Error/Warning messages:") 4917 error_message_command_all = [] 4918 error_message_command_warning = [] 4919 error_message_command_err = [] 4920 for err_file in err_files: 4921 with open(err_file, "r") as f: 4922 for line in f: 4923 message = line.strip() 4924 error_message_command_all.append(message) 4925 if line.startswith("[W::"): 4926 error_message_command_warning.append(message) 4927 if line.startswith("[E::"): 4928 error_message_command_err.append(f"{err_file}: " + message) 4929 # log info 4930 for message in list( 4931 set(error_message_command_err + error_message_command_warning) 4932 ): 4933 log.info(f" {message}") 4934 # debug info 4935 for message in list(set(error_message_command_all)): 4936 log.debug(f" {message}") 4937 # failed 4938 if len(error_message_command_err): 4939 log.error("Annotation failed: Error in commands") 4940 raise ValueError("Annotation failed: Error in commands") 4941 4942 # Find annotation in header 4943 with open(tmp_annotate_vcf_name, "rt") as f: 4944 header_list = self.read_vcf_header(f) 4945 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4946 4947 for ann in annovar_vcf_header.infos: 4948 if ann not in self.get_header().infos: 4949 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 4950 4951 # Update variants 4952 log.info(f"Annotation - Updating...") 4953 self.update_from_vcf(tmp_annotate_vcf_name) 4954 4955 else: 4956 if "ANN" in self.get_header().infos: 4957 log.debug(f"Existing snpEff annotations in VCF") 4958 if force_update_annotation: 4959 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates with snpEff
Parameters
- threads: The number of threads to use
Returns
None.
4961 def annotation_annovar(self, threads: int = None) -> None: 4962 """ 4963 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4964 annotations 4965 4966 :param threads: number of threads to use 4967 :return: the value of the variable "return_value". 4968 """ 4969 4970 # DEBUG 4971 log.debug("Start annotation with Annovar databases") 4972 4973 # Threads 4974 if not threads: 4975 threads = self.get_threads() 4976 log.debug("Threads: " + str(threads)) 4977 4978 # Tmp en Err files 4979 tmp_files = [] 4980 err_files = [] 4981 4982 # DEBUG 4983 delete_tmp = True 4984 if self.get_config().get("verbosity", "warning") in ["debug"]: 4985 delete_tmp = False 4986 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4987 4988 # Config 4989 config = self.get_config() 4990 log.debug("Config: " + str(config)) 4991 4992 # Config - Folders - Databases 4993 databases_folders = ( 4994 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4995 ) 4996 log.debug("Databases annotations: " + str(databases_folders)) 4997 4998 # Config - annovar bin command 4999 annovar_bin_command = get_bin_command( 5000 bin="table_annovar.pl", 5001 tool="annovar", 5002 bin_type="perl", 5003 config=config, 5004 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5005 ) 5006 if not annovar_bin_command: 5007 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5008 log.error(msg_err) 5009 raise ValueError(msg_err) 5010 5011 # Config - BCFTools bin command 5012 bcftools_bin_command = get_bin_command( 5013 bin="bcftools", 5014 tool="bcftools", 5015 bin_type="bin", 5016 config=config, 5017 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5018 ) 5019 if not bcftools_bin_command: 5020 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5021 log.error(msg_err) 5022 raise ValueError(msg_err) 5023 5024 # Config - annovar databases 5025 annovar_databases = ( 5026 config.get("folders", {}) 5027 .get("databases", {}) 
5028 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5029 ) 5030 annovar_databases = full_path(annovar_databases) 5031 if annovar_databases != "" and not os.path.exists(annovar_databases): 5032 os.makedirs(annovar_databases) 5033 5034 # Param 5035 param = self.get_param() 5036 log.debug("Param: " + str(param)) 5037 5038 # Param - options 5039 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5040 log.debug("Options: " + str(options)) 5041 5042 # Param - annotations 5043 annotations = ( 5044 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5045 ) 5046 log.debug("Annotations: " + str(annotations)) 5047 5048 # Param - Assembly 5049 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5050 5051 # Annovar database assembly 5052 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5053 if annovar_databases_assembly != "" and not os.path.exists( 5054 annovar_databases_assembly 5055 ): 5056 os.makedirs(annovar_databases_assembly) 5057 5058 # Data 5059 table_variants = self.get_table_variants() 5060 5061 # Check if not empty 5062 log.debug("Check if not empty") 5063 sql_query_chromosomes = ( 5064 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5065 ) 5066 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5067 if not sql_query_chromosomes_df["count"][0]: 5068 log.info(f"VCF empty") 5069 return 5070 5071 # VCF header 5072 vcf_reader = self.get_header() 5073 log.debug("Initial header: " + str(vcf_reader.infos)) 5074 5075 # Existing annotations 5076 for vcf_annotation in self.get_header().infos: 5077 5078 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5079 log.debug( 5080 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5081 ) 5082 5083 force_update_annotation = True 5084 5085 if annotations: 5086 5087 commands = [] 5088 tmp_annotates_vcf_name_list = [] 5089 5090 # Export in VCF 5091 log.debug("Create initial file to 
annotate") 5092 tmp_vcf = NamedTemporaryFile( 5093 prefix=self.get_prefix(), 5094 dir=self.get_tmp_dir(), 5095 suffix=".vcf.gz", 5096 delete=False, 5097 ) 5098 tmp_vcf_name = tmp_vcf.name 5099 tmp_files.append(tmp_vcf_name) 5100 tmp_files.append(tmp_vcf_name + ".tbi") 5101 5102 # Export VCF file 5103 self.export_variant_vcf( 5104 vcf_file=tmp_vcf_name, 5105 remove_info=".", 5106 add_samples=False, 5107 index=True, 5108 ) 5109 5110 # Create file for field rename 5111 log.debug("Create file for field rename") 5112 tmp_rename = NamedTemporaryFile( 5113 prefix=self.get_prefix(), 5114 dir=self.get_tmp_dir(), 5115 suffix=".rename", 5116 delete=False, 5117 ) 5118 tmp_rename_name = tmp_rename.name 5119 tmp_files.append(tmp_rename_name) 5120 5121 # Check Annovar database 5122 log.debug( 5123 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5124 ) 5125 databases_download_annovar( 5126 folder=annovar_databases, 5127 files=list(annotations.keys()), 5128 assemblies=[assembly], 5129 ) 5130 5131 for annotation in annotations: 5132 annotation_fields = annotations[annotation] 5133 5134 if not annotation_fields: 5135 annotation_fields = {"INFO": None} 5136 5137 log.info(f"Annotations Annovar - database '{annotation}'") 5138 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5139 5140 # Tmp file for annovar 5141 err_files = [] 5142 tmp_annotate_vcf_directory = TemporaryDirectory( 5143 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5144 ) 5145 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5146 tmp_annotate_vcf_name_annovar = ( 5147 tmp_annotate_vcf_prefix + "." 
+ assembly + "_multianno.vcf" 5148 ) 5149 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5150 err_files.append(tmp_annotate_vcf_name_err) 5151 tmp_files.append(tmp_annotate_vcf_name_err) 5152 5153 # Tmp file final vcf annotated by annovar 5154 tmp_annotate_vcf = NamedTemporaryFile( 5155 prefix=self.get_prefix(), 5156 dir=self.get_tmp_dir(), 5157 suffix=".vcf.gz", 5158 delete=False, 5159 ) 5160 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5161 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5162 tmp_files.append(tmp_annotate_vcf_name) 5163 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5164 5165 # Number of fields 5166 annotation_list = [] 5167 annotation_renamed_list = [] 5168 5169 for annotation_field in annotation_fields: 5170 5171 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5172 annotation_fields_new_name = annotation_fields.get( 5173 annotation_field, annotation_field 5174 ) 5175 if not annotation_fields_new_name: 5176 annotation_fields_new_name = annotation_field 5177 5178 if ( 5179 force_update_annotation 5180 or annotation_fields_new_name not in self.get_header().infos 5181 ): 5182 annotation_list.append(annotation_field) 5183 annotation_renamed_list.append(annotation_fields_new_name) 5184 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5185 log.warning( 5186 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5187 ) 5188 5189 # Add rename info 5190 run_parallel_commands( 5191 [ 5192 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5193 ], 5194 1, 5195 ) 5196 5197 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5198 log.debug("annotation_list: " + str(annotation_list)) 5199 5200 # protocol 5201 protocol = annotation 5202 5203 # argument 5204 argument = "" 5205 5206 # operation 5207 operation = "f" 5208 if annotation in ["refGene", "refGeneWithVer"] or 
annotation.startswith( 5209 "ensGene" 5210 ): 5211 operation = "g" 5212 if options.get("genebase", None): 5213 argument = f"""'{options.get("genebase","")}'""" 5214 elif annotation in ["cytoBand"]: 5215 operation = "r" 5216 5217 # argument option 5218 argument_option = "" 5219 if argument != "": 5220 argument_option = " --argument " + argument 5221 5222 # command options 5223 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5224 for option in options: 5225 if option not in ["genebase"]: 5226 command_options += f""" --{option}={options[option]}""" 5227 5228 # Command 5229 5230 # Command - Annovar 5231 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5232 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5233 5234 # Command - start pipe 5235 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5236 5237 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5238 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5239 5240 # Command - Special characters (refGene annotation) 5241 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5242 5243 # Command - Clean empty fields (with value ".") 5244 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5245 5246 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5247 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5248 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5249 # for ann in annotation_renamed_list: 5250 for ann in annotation_list: 5251 annovar_fields_to_keep.append(f"^INFO/{ann}") 5252 5253 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5254 5255 # Command - indexing 5256 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5257 5258 log.debug(f"Annotation - Annovar command: {command_annovar}") 5259 run_parallel_commands([command_annovar], 1) 5260 5261 # Error messages 5262 log.info(f"Error/Warning messages:") 5263 error_message_command_all = [] 5264 error_message_command_warning = [] 5265 error_message_command_err = [] 5266 for err_file in err_files: 5267 with open(err_file, "r") as f: 5268 for line in f: 5269 message = line.strip() 5270 error_message_command_all.append(message) 5271 if line.startswith("[W::") or line.startswith("WARNING"): 5272 error_message_command_warning.append(message) 5273 if line.startswith("[E::") or line.startswith("ERROR"): 5274 
error_message_command_err.append( 5275 f"{err_file}: " + message 5276 ) 5277 # log info 5278 for message in list( 5279 set(error_message_command_err + error_message_command_warning) 5280 ): 5281 log.info(f" {message}") 5282 # debug info 5283 for message in list(set(error_message_command_all)): 5284 log.debug(f" {message}") 5285 # failed 5286 if len(error_message_command_err): 5287 log.error("Annotation failed: Error in commands") 5288 raise ValueError("Annotation failed: Error in commands") 5289 5290 if tmp_annotates_vcf_name_list: 5291 5292 # List of annotated files 5293 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5294 5295 # Tmp file 5296 tmp_annotate_vcf = NamedTemporaryFile( 5297 prefix=self.get_prefix(), 5298 dir=self.get_tmp_dir(), 5299 suffix=".vcf.gz", 5300 delete=False, 5301 ) 5302 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5303 tmp_files.append(tmp_annotate_vcf_name) 5304 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5305 err_files.append(tmp_annotate_vcf_name_err) 5306 tmp_files.append(tmp_annotate_vcf_name_err) 5307 5308 # Command merge 5309 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5310 log.info( 5311 f"Annotation Annovar - Annotation merging " 5312 + str(len(tmp_annotates_vcf_name_list)) 5313 + " annotated files" 5314 ) 5315 log.debug(f"Annotation - merge command: {merge_command}") 5316 run_parallel_commands([merge_command], 1) 5317 5318 # Find annotation in header 5319 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5320 header_list = self.read_vcf_header(f) 5321 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5322 5323 for ann in annovar_vcf_header.infos: 5324 if ann not in self.get_header().infos: 5325 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5326 5327 # Update variants 5328 log.info(f"Annotation Annovar - 
Updating...") 5329 self.update_from_vcf(tmp_annotate_vcf_name) 5330 5331 # Clean files 5332 # Tmp file remove command 5333 if True: 5334 tmp_files_remove_command = "" 5335 if tmp_files: 5336 tmp_files_remove_command = " ".join(tmp_files) 5337 clean_command = f" rm -f {tmp_files_remove_command} " 5338 log.debug(f"Annotation Annovar - Annotation cleaning ") 5339 log.debug(f"Annotation - cleaning command: {clean_command}") 5340 run_parallel_commands([clean_command], 1)
    # NOTE(review): stray docstring fragment found as loose prose between
    # methods (duplicated from the Annovar annotation method's summary);
    # converted to a comment so the module stays syntactically valid.
    # Summary: takes a VCF file, annotates it with Annovar, and then updates
    # the database with the new annotations.
    # Parameters: threads — number of threads to use.
    # Returns: None.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table from one or more Parquet (or attached
        database) annotation sources.

        For each configured annotation database, the database header is read,
        the requested INFO fields are mapped to database columns, and a set of
        per-chromosome SQL UPDATE queries is built and executed against the
        variants table to append ``FIELD=value`` pairs to the INFO column.

        :param threads: number of threads to use for the annotation (defaults
            to the object's configured thread count)
        :return: None
        :raises ValueError: if a configured annotation database file or its
            header file cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # delete_tmp is only logged here; temp-file cleanup for this method is
        # handled elsewhere (NOTE(review): confirm — no tmp files are created
        # in this method itself)
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: union of the "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: mapping of database -> {field: new_name} to annotate with
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: re-annotate fields already present in the
        # header (existing INFO values are removed first, see query_dict_remove)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Append mode: only fill fields whose current value is empty or "."
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header object; INFO entries are added to it as fields are annotated
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        # NOTE(review): table name "variants" is hardcoded here instead of
        # {table_variants} — confirm both always refer to the same table
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped again at the end of the method)
        added_columns = []

        # drop indexes (they would be invalidated by the UPDATEs below)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # Special key "ALL": scan available databases and add each of them
            # to the annotation dict with all their INFO fields
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a meta-entry, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields ({field: new_name}); default to full INFO
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object (resolves the file in the folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH for duckdb/sqlite sources)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    # SQL expression used to reference the database in queries
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF ones)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Register extra columns absent from the header as generic
                    # String INFO fields so they can be annotated too
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database: expand "ALL"/"INFO" into the
                    # full field list (identity mapping: no renaming)
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for regions databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column: defaults to the INFO column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            # In update mode, first strip the existing
                            # FIELD=value from INFO (queued in query_dict_remove
                            # so it runs before the annotation UPDATEs)
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (";" from the 2nd on)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (with defaults for any
                            # attribute missing from the database header)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: extra predicate so only empty/"." values
                            # in the variants table are overwritten
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column: extract the value from the
                            # database's INFO string by regex
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column: take the column value
                            # (";" replaced by "," to keep INFO well-formed)
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    # When every field is requested and the database carries a
                    # full INFO column, copy it wholesale instead of per-field
                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                        """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Start from the removal queries so they run before the
                        # annotation UPDATEs (dict preserves insertion order)
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: aggregate all
                            # region rows overlapping each variant position
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match on
                            # chromosome, position, REF and ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the generated
                            # FIELD=value fragments to INFO, inserting ";" only
                            # when INFO already holds a value
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            AND (
                                                concat({sql_query_annotation_update_info_sets_sql})
                                            )
                                            NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                            """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can nest very deeply)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # "Count" is the affected-row count returned by a
                            # duckdb UPDATE statement
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    # NOTE(review): stray docstring fragment found as loose prose between
    # methods (duplicated from annotation_parquet's summary); converted to a
    # comment so the module stays syntactically valid.
    # Summary: takes a VCF file, and annotates it with a parquet file.
    # Parameters: threads — number of threads to use for the annotation.
    # Returns: None.
5919 def annotation_splice(self, threads: int = None) -> None: 5920 """ 5921 This function annotate with snpEff 5922 5923 :param threads: The number of threads to use 5924 :return: the value of the variable "return_value". 5925 """ 5926 5927 # DEBUG 5928 log.debug("Start annotation with splice tools") 5929 5930 # Threads 5931 if not threads: 5932 threads = self.get_threads() 5933 log.debug("Threads: " + str(threads)) 5934 5935 # DEBUG 5936 delete_tmp = True 5937 if self.get_config().get("verbosity", "warning") in ["debug"]: 5938 delete_tmp = False 5939 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5940 5941 # Config 5942 config = self.get_config() 5943 log.debug("Config: " + str(config)) 5944 splice_config = config.get("tools", {}).get("splice", {}) 5945 if not splice_config: 5946 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5947 if not splice_config: 5948 msg_err = "No Splice tool config" 5949 log.error(msg_err) 5950 raise ValueError(msg_err) 5951 log.debug(f"splice_config={splice_config}") 5952 5953 # Config - Folders - Databases 5954 databases_folders = ( 5955 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5956 ) 5957 log.debug("Databases annotations: " + str(databases_folders)) 5958 5959 # Splice docker image 5960 splice_docker_image = splice_config.get("docker").get("image") 5961 5962 # Pull splice image if it's not already there 5963 if not check_docker_image_exists(splice_docker_image): 5964 log.warning( 5965 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5966 ) 5967 try: 5968 command(f"docker pull {splice_config.get('docker').get('image')}") 5969 except subprocess.CalledProcessError: 5970 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5971 log.error(msg_err) 5972 raise ValueError(msg_err) 5973 return None 5974 5975 # Config - splice databases 5976 splice_databases = ( 5977 config.get("folders", {}) 5978 .get("databases", {}) 5979 
.get("splice", DEFAULT_SPLICE_FOLDER) 5980 ) 5981 splice_databases = full_path(splice_databases) 5982 5983 # Param 5984 param = self.get_param() 5985 log.debug("Param: " + str(param)) 5986 5987 # Param 5988 options = param.get("annotation", {}).get("splice", {}) 5989 log.debug("Options: " + str(options)) 5990 5991 # Data 5992 table_variants = self.get_table_variants() 5993 5994 # Check if not empty 5995 log.debug("Check if not empty") 5996 sql_query_chromosomes = ( 5997 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5998 ) 5999 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6000 log.info("VCF empty") 6001 return None 6002 6003 # Export in VCF 6004 log.debug("Create initial file to annotate") 6005 6006 # Create output folder 6007 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6008 if not os.path.exists(output_folder): 6009 Path(output_folder).mkdir(parents=True, exist_ok=True) 6010 6011 # Create tmp VCF file 6012 tmp_vcf = NamedTemporaryFile( 6013 prefix=self.get_prefix(), 6014 dir=output_folder, 6015 suffix=".vcf", 6016 delete=False, 6017 ) 6018 tmp_vcf_name = tmp_vcf.name 6019 6020 # VCF header 6021 header = self.get_header() 6022 6023 # Existing annotations 6024 for vcf_annotation in self.get_header().infos: 6025 6026 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6027 log.debug( 6028 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6029 ) 6030 6031 # Memory limit 6032 if config.get("memory", None): 6033 memory_limit = config.get("memory", "8G").upper() 6034 # upper() 6035 else: 6036 memory_limit = "8G" 6037 log.debug(f"memory_limit: {memory_limit}") 6038 6039 # Check number of variants to annotate 6040 where_clause_regex_spliceai = r"SpliceAI_\w+" 6041 where_clause_regex_spip = r"SPiP_\w+" 6042 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6043 
df_list_of_variants_to_annotate = self.get_query_to_df( 6044 query=f""" SELECT * FROM variants {where_clause} """ 6045 ) 6046 if len(df_list_of_variants_to_annotate) == 0: 6047 log.warning( 6048 f"No variants to annotate with splice. Variants probably already annotated with splice" 6049 ) 6050 return None 6051 else: 6052 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6053 6054 # Export VCF file 6055 self.export_variant_vcf( 6056 vcf_file=tmp_vcf_name, 6057 remove_info=True, 6058 add_samples=True, 6059 index=False, 6060 where_clause=where_clause, 6061 ) 6062 6063 # Create docker container and launch splice analysis 6064 if splice_config: 6065 6066 # Splice mount folders 6067 mount_folders = splice_config.get("mount", {}) 6068 6069 # Genome mount 6070 mount_folders[ 6071 config.get("folders", {}) 6072 .get("databases", {}) 6073 .get("genomes", DEFAULT_GENOME_FOLDER) 6074 ] = "ro" 6075 6076 # SpliceAI mount 6077 mount_folders[ 6078 config.get("folders", {}) 6079 .get("databases", {}) 6080 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6081 ] = "ro" 6082 6083 # Genome mount 6084 mount_folders[ 6085 config.get("folders", {}) 6086 .get("databases", {}) 6087 .get("spip", DEFAULT_SPIP_FOLDER) 6088 ] = "ro" 6089 6090 # Mount folders 6091 mount = [] 6092 6093 # Config mount 6094 mount = [ 6095 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6096 for path, mode in mount_folders.items() 6097 ] 6098 6099 if any(value for value in splice_config.values() if value is None): 6100 log.warning("At least one splice config parameter is empty") 6101 return None 6102 6103 # Params in splice nf 6104 def check_values(dico: dict): 6105 """ 6106 Ensure parameters for NF splice pipeline 6107 """ 6108 for key, val in dico.items(): 6109 if key == "genome": 6110 if any( 6111 assemb in options.get("genome", {}) 6112 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6113 ): 6114 yield f"--{key} hg19" 6115 elif any( 6116 assemb in options.get("genome", {}) 6117 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6118 ): 6119 yield f"--{key} hg38" 6120 elif ( 6121 (isinstance(val, str) and val) 6122 or isinstance(val, int) 6123 or isinstance(val, bool) 6124 ): 6125 yield f"--{key} {val}" 6126 6127 # Genome 6128 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6129 options["genome"] = genome 6130 6131 # NF params 6132 nf_params = [] 6133 6134 # Add options 6135 if options: 6136 nf_params = list(check_values(options)) 6137 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6138 else: 6139 log.debug("No NF params provided") 6140 6141 # Add threads 6142 if "threads" not in options.keys(): 6143 nf_params.append(f"--threads {threads}") 6144 6145 # Genome path 6146 genome_path = find_genome( 6147 config.get("folders", {}) 6148 .get("databases", {}) 6149 .get("genomes", DEFAULT_GENOME_FOLDER), 6150 file=f"{genome}.fa", 6151 ) 6152 # Add genome path 6153 if not genome_path: 6154 raise ValueError( 6155 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6156 ) 6157 else: 6158 log.debug(f"Genome: {genome_path}") 6159 nf_params.append(f"--genome_path {genome_path}") 6160 6161 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6162 """ 6163 Setting up updated databases for SPiP and SpliceAI 6164 """ 6165 6166 try: 6167 6168 # SpliceAI assembly transcriptome 6169 spliceai_assembly = os.path.join( 6170 config.get("folders", {}) 6171 .get("databases", {}) 6172 .get("spliceai", {}), 6173 options.get("genome"), 6174 "transcriptome", 6175 ) 6176 spip_assembly = options.get("genome") 6177 6178 spip = find( 6179 f"transcriptome_{spip_assembly}.RData", 6180 config.get("folders", {}).get("databases", {}).get("spip", {}), 6181 ) 6182 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6183 log.debug(f"SPiP annotations: {spip}") 6184 log.debug(f"SpliceAI annotations: {spliceai}") 6185 if spip and spliceai: 6186 return [ 6187 
f"--spip_transcriptome {spip}", 6188 f"--spliceai_annotations {spliceai}", 6189 ] 6190 else: 6191 # TODO crash and go on with basic annotations ? 6192 # raise ValueError( 6193 # "Can't find splice databases in configuration EXIT" 6194 # ) 6195 log.warning( 6196 "Can't find splice databases in configuration, use annotations file from image" 6197 ) 6198 except TypeError: 6199 log.warning( 6200 "Can't find splice databases in configuration, use annotations file from image" 6201 ) 6202 return [] 6203 6204 # Add options, check if transcriptome option have already beend provided 6205 if ( 6206 "spip_transcriptome" not in nf_params 6207 and "spliceai_transcriptome" not in nf_params 6208 ): 6209 splice_reference = splice_annotations(options, config) 6210 if splice_reference: 6211 nf_params.extend(splice_reference) 6212 6213 nf_params.append(f"--output_folder {output_folder}") 6214 6215 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6216 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6217 log.debug(cmd) 6218 6219 splice_config["docker"]["command"] = cmd 6220 6221 docker_cmd = get_bin_command( 6222 tool="splice", 6223 bin_type="docker", 6224 config=config, 6225 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6226 add_options=f"--name {random_uuid} {' '.join(mount)}", 6227 ) 6228 6229 # Docker debug 6230 # if splice_config.get("rm_container"): 6231 # rm_container = "--rm" 6232 # else: 6233 # rm_container = "" 6234 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6235 6236 log.debug(docker_cmd) 6237 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6238 log.debug(res.stdout) 6239 if 
res.stderr: 6240 log.error(res.stderr) 6241 res.check_returncode() 6242 else: 6243 log.warning(f"Splice tool configuration not found: {config}") 6244 6245 # Update variants 6246 log.info("Annotation - Updating...") 6247 # Test find output vcf 6248 log.debug( 6249 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6250 ) 6251 output_vcf = [] 6252 # Wrong folder to look in 6253 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6254 if ( 6255 files 6256 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6257 ): 6258 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6259 # log.debug(os.listdir(options.get("output_folder"))) 6260 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6261 if not output_vcf: 6262 log.debug( 6263 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6264 ) 6265 else: 6266 # Get new header from annotated vcf 6267 log.debug(f"Initial header: {len(header.infos)} fields") 6268 # Create new header with splice infos 6269 new_vcf = Variants(input=output_vcf[0]) 6270 new_vcf_header = new_vcf.get_header().infos 6271 for keys, infos in new_vcf_header.items(): 6272 if keys not in header.infos.keys(): 6273 header.infos[keys] = infos 6274 log.debug(f"New header: {len(header.infos)} fields") 6275 log.debug(f"Splice tmp output: {output_vcf[0]}") 6276 self.update_from_vcf(output_vcf[0]) 6277 6278 # Remove folder 6279 remove_if_exists(output_folder)
This function annotates variants with snpEff.
Parameters
- threads: the number of threads to use
Returns
the value of the variable "return_value".
6285 def get_config_default(self, name: str) -> dict: 6286 """ 6287 The function `get_config_default` returns a dictionary containing default configurations for 6288 various calculations and prioritizations. 6289 6290 :param name: The `get_config_default` function returns a dictionary containing default 6291 configurations for different calculations and prioritizations. The `name` parameter is used to 6292 specify which specific configuration to retrieve from the dictionary 6293 :type name: str 6294 :return: The function `get_config_default` returns a dictionary containing default configuration 6295 settings for different calculations and prioritizations. The specific configuration settings are 6296 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6297 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6298 returned. If there is no match, an empty dictionary is returned. 6299 """ 6300 6301 config_default = { 6302 "calculations": { 6303 "variant_chr_pos_alt_ref": { 6304 "type": "sql", 6305 "name": "variant_chr_pos_alt_ref", 6306 "description": "Create a variant ID with chromosome, position, alt and ref", 6307 "available": False, 6308 "output_column_name": "variant_chr_pos_alt_ref", 6309 "output_column_type": "String", 6310 "output_column_description": "variant ID with chromosome, position, alt and ref", 6311 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6312 "operation_info": True, 6313 }, 6314 "VARTYPE": { 6315 "type": "sql", 6316 "name": "VARTYPE", 6317 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6318 "available": True, 6319 "output_column_name": "VARTYPE", 6320 "output_column_type": "String", 6321 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6322 "operation_query": """ 6323 CASE 6324 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6325 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6326 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6327 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6328 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6329 ELSE 'UNDEFINED' 6330 END 6331 """, 6332 "info_fields": ["SVTYPE"], 6333 "operation_info": True, 6334 }, 6335 "snpeff_hgvs": { 6336 "type": "python", 6337 "name": "snpeff_hgvs", 6338 "description": "HGVS nomenclatures from snpEff annotation", 6339 "available": True, 6340 "function_name": "calculation_extract_snpeff_hgvs", 6341 "function_params": ["snpeff_hgvs", "ANN"], 6342 }, 6343 "snpeff_ann_explode": { 6344 "type": "python", 6345 "name": "snpeff_ann_explode", 6346 "description": "Explode snpEff annotations with uniquify values", 6347 "available": True, 6348 "function_name": "calculation_snpeff_ann_explode", 6349 "function_params": [False, "fields", "snpeff_", "ANN"], 6350 }, 6351 "snpeff_ann_explode_uniquify": { 6352 "type": "python", 6353 "name": "snpeff_ann_explode_uniquify", 6354 "description": "Explode snpEff annotations", 6355 "available": True, 6356 "function_name": "calculation_snpeff_ann_explode", 6357 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6358 }, 6359 "snpeff_ann_explode_json": { 6360 "type": "python", 6361 "name": "snpeff_ann_explode_json", 6362 "description": "Explode snpEff annotations in JSON format", 6363 "available": True, 6364 "function_name": "calculation_snpeff_ann_explode", 6365 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6366 }, 6367 "NOMEN": { 6368 "type": "python", 6369 "name": "NOMEN", 6370 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6371 "available": True, 6372 "function_name": "calculation_extract_nomen", 6373 "function_params": [], 6374 }, 6375 "FINDBYPIPELINE": { 6376 "type": "python", 6377 "name": "FINDBYPIPELINE", 6378 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6379 "available": True, 6380 "function_name": "calculation_find_by_pipeline", 6381 "function_params": ["findbypipeline"], 6382 }, 6383 "FINDBYSAMPLE": { 6384 "type": "python", 6385 "name": "FINDBYSAMPLE", 6386 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6387 "available": True, 6388 "function_name": "calculation_find_by_pipeline", 6389 "function_params": ["findbysample"], 6390 }, 6391 "GENOTYPECONCORDANCE": { 6392 "type": "python", 6393 "name": "GENOTYPECONCORDANCE", 6394 "description": "Concordance of genotype for multi caller VCF", 6395 "available": True, 6396 "function_name": "calculation_genotype_concordance", 6397 "function_params": [], 6398 }, 6399 "BARCODE": { 6400 "type": "python", 6401 "name": "BARCODE", 6402 "description": "BARCODE as VaRank tool", 6403 "available": True, 6404 "function_name": "calculation_barcode", 6405 "function_params": [], 6406 }, 6407 "BARCODEFAMILY": { 6408 "type": "python", 6409 "name": "BARCODEFAMILY", 6410 "description": "BARCODEFAMILY as VaRank tool", 6411 "available": True, 6412 "function_name": "calculation_barcode_family", 6413 "function_params": ["BCF"], 6414 }, 6415 "TRIO": { 6416 "type": "python", 6417 "name": "TRIO", 6418 "description": "Inheritance for a trio family", 6419 "available": True, 6420 "function_name": "calculation_trio", 6421 "function_params": [], 6422 }, 6423 "VAF": { 6424 "type": "python", 6425 "name": "VAF", 6426 "description": "Variant Allele Frequency (VAF) harmonization", 6427 "available": True, 6428 "function_name": "calculation_vaf_normalization", 6429 "function_params": [], 6430 }, 6431 "VAF_stats": { 6432 "type": "python", 6433 "name": 
"VAF_stats", 6434 "description": "Variant Allele Frequency (VAF) statistics", 6435 "available": True, 6436 "function_name": "calculation_genotype_stats", 6437 "function_params": ["VAF"], 6438 }, 6439 "DP_stats": { 6440 "type": "python", 6441 "name": "DP_stats", 6442 "description": "Depth (DP) statistics", 6443 "available": True, 6444 "function_name": "calculation_genotype_stats", 6445 "function_params": ["DP"], 6446 }, 6447 "variant_id": { 6448 "type": "python", 6449 "name": "variant_id", 6450 "description": "Variant ID generated from variant position and type", 6451 "available": True, 6452 "function_name": "calculation_variant_id", 6453 "function_params": [], 6454 }, 6455 "transcripts_json": { 6456 "type": "python", 6457 "name": "transcripts_json", 6458 "description": "Add transcripts info in JSON format (field 'transcripts_json')", 6459 "available": True, 6460 "function_name": "calculation_transcripts_json", 6461 "function_params": ["transcripts_json"], 6462 }, 6463 "transcripts_prioritization": { 6464 "type": "python", 6465 "name": "transcripts_prioritization", 6466 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6467 "available": True, 6468 "function_name": "calculation_transcripts_prioritization", 6469 "function_params": [], 6470 }, 6471 }, 6472 "prioritizations": { 6473 "default": { 6474 "filter": [ 6475 { 6476 "type": "notequals", 6477 "value": "!PASS|\\.", 6478 "score": 0, 6479 "flag": "FILTERED", 6480 "comment": ["Bad variant quality"], 6481 }, 6482 { 6483 "type": "equals", 6484 "value": "REJECT", 6485 "score": -20, 6486 "flag": "PASS", 6487 "comment": ["Bad variant quality"], 6488 }, 6489 ], 6490 "DP": [ 6491 { 6492 "type": "gte", 6493 "value": "50", 6494 "score": 5, 6495 "flag": "PASS", 6496 "comment": ["DP higher than 50"], 6497 } 6498 ], 6499 "ANN": [ 6500 { 6501 "type": "contains", 6502 "value": "HIGH", 6503 "score": 5, 6504 "flag": "PASS", 6505 "comment": [ 6506 "The variant is assumed to have high 
(disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6507 ], 6508 }, 6509 { 6510 "type": "contains", 6511 "value": "MODERATE", 6512 "score": 3, 6513 "flag": "PASS", 6514 "comment": [ 6515 "A non-disruptive variant that might change protein effectiveness" 6516 ], 6517 }, 6518 { 6519 "type": "contains", 6520 "value": "LOW", 6521 "score": 0, 6522 "flag": "FILTERED", 6523 "comment": [ 6524 "Assumed to be mostly harmless or unlikely to change protein behavior" 6525 ], 6526 }, 6527 { 6528 "type": "contains", 6529 "value": "MODIFIER", 6530 "score": 0, 6531 "flag": "FILTERED", 6532 "comment": [ 6533 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6534 ], 6535 }, 6536 ], 6537 } 6538 }, 6539 } 6540 6541 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: the `name` parameter is used to specify which specific configuration to retrieve from
  the default configuration dictionary.
Returns
The function `get_config_default` returns a dictionary containing default configuration
settings for different calculations and prioritizations. The specific configuration settings
are retrieved based on the input `name` parameter; if `name` matches a key in the
`config_default` dictionary, the corresponding configuration settings are returned, otherwise
no default configuration is returned for that name.
6543 def get_config_json( 6544 self, name: str, config_dict: dict = {}, config_file: str = None 6545 ) -> dict: 6546 """ 6547 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6548 default values, a dictionary, and a file. 6549 6550 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6551 the name of the configuration. It is used to identify and retrieve the configuration settings 6552 for a specific component or module 6553 :type name: str 6554 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6555 dictionary that allows you to provide additional configuration settings or overrides. When you 6556 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6557 the key is the configuration setting you want to override or 6558 :type config_dict: dict 6559 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6560 specify the path to a configuration file that contains additional settings. If provided, the 6561 function will read the contents of this file and update the configuration dictionary with the 6562 values found in the file, overriding any existing values with the 6563 :type config_file: str 6564 :return: The function `get_config_json` returns a dictionary containing the configuration 6565 settings. 
6566 """ 6567 6568 # Create with default prioritizations 6569 config_default = self.get_config_default(name=name) 6570 configuration = config_default 6571 # log.debug(f"configuration={configuration}") 6572 6573 # Replace prioritizations from dict 6574 for config in config_dict: 6575 configuration[config] = config_dict[config] 6576 6577 # Replace prioritizations from file 6578 config_file = full_path(config_file) 6579 if config_file: 6580 if os.path.exists(config_file): 6581 with open(config_file) as config_file_content: 6582 config_file_dict = json.load(config_file_content) 6583 for config in config_file_dict: 6584 configuration[config] = config_file_dict[config] 6585 else: 6586 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6587 log.error(msg_error) 6588 raise ValueError(msg_error) 6589 6590 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: a string naming the configuration to retrieve; it identifies the configuration
  settings for a specific component or module.
- config_dict: a dictionary of additional configuration settings; its key-value pairs
  override the corresponding default configuration values.
- config_file: the path to a JSON configuration file; if provided, its contents are read and
  override any existing values in the configuration dictionary.
Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        Prioritize variants by applying configured profiles of criteria, adding new
        PZ INFO fields (Score/Flag/Comment/Infos/Tags) to the variants table.

        For each requested profile, per-annotation criteria from the
        "prioritizations" configuration are translated into SQL UPDATE statements
        executed on the variants table, then the computed PZ columns are folded back
        into the INFO column.

        :param table: Name of the table to prioritize; when None, the variants
            table returned by `get_table_variants(clause="update")` is used
        :type table: str
        :param pz_prefix: Prefix for the created INFO fields; when None, taken from
            the "pzprefix" parameter, defaulting to "PZ"
        :type pz_prefix: str
        :param pz_param: Prioritization parameters (profiles, pzfields, separator,
            score mode, config file...); when None, taken from param "prioritization"
        :type pz_param: dict
        :return: True when prioritization was performed, False when no profile is
            defined
        :raises ValueError: If a requested profile is not present in the
            prioritizations configuration
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: explicit argument wins over the "prioritization" param section
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (built-in defaults possibly overridden by a config file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix (default "PZ")
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings are accepted for lists)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: top-level "prioritizations" param adds extra profiles
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First profile becomes the default when none is configured
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Columns added during processing, dropped again at the end
        added_columns = []

        # Create list of PZfields: base fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZfields not already present in the VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF header metadata for each PZ field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (base fields, tied to the default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add a working column per new PZfield; SQL type chosen by name pattern
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set: build INFO key=value fragments per PZ column;
                        # for the default profile, an un-suffixed alias is also written

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields fragments into one ';'-separated expression
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Explode specific annotation into its own column
                            log.debug(f"Explode annotation '{annotation}'")
                            added_columns += self.explode_infos(
                                prefix=explode_infos_prefix,
                                fields=[annotation],
                                table=table_variants,
                            )
                            extra_infos = self.get_extra_infos(table=table_variants)

                            # Check if annotation field is present
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so values embed safely in SQL/INFO
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set: SET clauses updating the PZ columns
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode sums criterion scores; VaRank mode keeps the max
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # Flag stays PASS only while every matched criterion passes
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Comment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Infos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use the
                                # comparison_map operator, others a SIMILAR TO regex
                                if sql_set_option:
                                    try:
                                        float(criterion_value)
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                        """
                                    except:
                                        contains_option = ""
                                        if criterion_type == "contains":
                                            contains_option = ".*"
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                        """
                                    sql_queries.append(sql_update)
                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZTags value ('|'-separated field#value pairs)
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile (un-suffixed alias)
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            # Apply all criterion updates for this profile
                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Fold the computed PZ columns back into the INFO column
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working columns are temporary)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
The prioritization function in Python processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The `table` parameter in the `prioritization` function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table.
- pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ".
- pz_param: The `pz_param` parameter in the `prioritization` method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns
A boolean value (True) is being returned from the `prioritization` function.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        Annotate variants with HGVS nomenclature.

        SNV/InDel variants are joined with refSeq transcripts in DuckDB, the
        HGVS name of each variant/transcript pair is computed in parallel
        (one Dask partition per thread), stored in a temporary table column,
        then appended to the INFO field as 'hgvs=...'. The method returns
        immediately if neither 'hgvs' nor 'hgvs_options' is present in the
        parameters.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
            threads to use for parallel processing. If no value is provided, it will default to the number
            of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of one Dask
            partition (a pandas DataFrame).

            :param partition: pandas DataFrame holding the rows to annotate
            :return: Series of HGVS annotation strings, one per row
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Compute the comma-separated list of HGVS names for one variant.

            Reads CHROM/POS/REF/ALT from `row`, finds the overlapping refSeq
            transcripts via the polars SQL context, and formats one HGVS name
            per transcript (plus an extra protein-level name when
            `add_protein` is set).

            :param row: dict-like with keys CHROM, POS, REF and ALT
            :return: comma-joined HGVS names (empty string if no transcript)
            """

            # NOTE(review): `chr` shadows the builtin of the same name
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Transcripts overlapping this position (refseq_df is resolved
            # by the polars SQLContext registered below — presumably via
            # register_globals; confirm polars version behavior)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model loaded earlier by read_transcripts()
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number only when requested (extra lookup cost)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession, required by protein-level formats
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name for this transcript
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Additional protein-level name when add_protein is set and
                # the main name is not already protein-level/full
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome path as configured (no default) — used first by find_genome
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse 'hgvs_options' ("opt", "opt=val", comma-separated)
        # into the param["hgvs"] section, coercing TRUE/FALSE to booleans
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param values override config values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF and ALT made of letters only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Temporary hgvs column (random suffix to avoid name clashes)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Transcripts overlapping each variant position, as a polars frame
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # transcript -> protein accession mapping (versioned accessions)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            # NOTE(review): refseqlink_df is only defined when a refSeqLink
            # file is found; the partition queries against it presumably fail
            # otherwise when protein info is requested — confirm upstream guard
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model via a TSV export
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion, re-created here — presumably so register_globals
        # picks up refseq_df/refseqlink_df now that they exist; TODO confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as
        # number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Copy the computed values into the temporary hgvs column,
            # matching rows on CHROM/POS/REF/ALT and skipping empty results
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Append 'hgvs=<value>' to INFO, prefixing with ';' only when INFO
        # already holds annotations
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Declare the new 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the `get_threads()` method.
7583 def get_operations_help( 7584 self, operations_config_dict: dict = {}, operations_config_file: str = None 7585 ) -> list: 7586 7587 # Init 7588 operations_help = [] 7589 7590 # operations 7591 operations = self.get_config_json( 7592 name="calculations", 7593 config_dict=operations_config_dict, 7594 config_file=operations_config_file, 7595 ) 7596 for op in operations: 7597 op_name = operations[op].get("name", op).upper() 7598 op_description = operations[op].get("description", op_name) 7599 op_available = operations[op].get("available", False) 7600 if op_available: 7601 operations_help.append(f" {op_name}: {op_description}") 7602 7603 # Sort operations 7604 operations_help.sort() 7605 7606 # insert header 7607 operations_help.insert(0, "Available calculation operations:") 7608 7609 # Return 7610 return operations_help
7612 def calculation( 7613 self, 7614 operations: dict = {}, 7615 operations_config_dict: dict = {}, 7616 operations_config_file: str = None, 7617 ) -> None: 7618 """ 7619 It takes a list of operations, and for each operation, it checks if it's a python or sql 7620 operation, and then calls the appropriate function 7621 7622 param json example: 7623 "calculation": { 7624 "NOMEN": { 7625 "options": { 7626 "hgvs_field": "hgvs" 7627 }, 7628 "middle" : null 7629 } 7630 """ 7631 7632 # Param 7633 param = self.get_param() 7634 7635 # operations config 7636 operations_config = self.get_config_json( 7637 name="calculations", 7638 config_dict=operations_config_dict, 7639 config_file=operations_config_file, 7640 ) 7641 7642 # Upper keys 7643 operations_config = {k.upper(): v for k, v in operations_config.items()} 7644 7645 # Calculations 7646 7647 # Operations from param 7648 operations = param.get("calculation", {}).get("calculations", operations) 7649 7650 # Quick calculation - add 7651 if param.get("calculations", None): 7652 calculations_list = [ 7653 value for value in param.get("calculations", "").split(",") 7654 ] 7655 log.info(f"Quick Calculations:") 7656 for calculation_key in calculations_list: 7657 log.info(f" {calculation_key}") 7658 for calculation_operation in calculations_list: 7659 if calculation_operation.upper() not in operations: 7660 operations[calculation_operation.upper()] = {} 7661 add_value_into_dict( 7662 dict_tree=param, 7663 sections=[ 7664 "calculation", 7665 "calculations", 7666 calculation_operation.upper(), 7667 ], 7668 value={}, 7669 ) 7670 7671 # Operations for calculation 7672 if not operations: 7673 operations = param.get("calculation", {}).get("calculations", {}) 7674 7675 if operations: 7676 log.info(f"Calculations...") 7677 7678 # For each operations 7679 for operation_name in operations: 7680 operation_name = operation_name.upper() 7681 if operation_name not in [""]: 7682 if operation_name in operations_config: 7683 
log.info(f"Calculation '{operation_name}'") 7684 operation = operations_config[operation_name] 7685 operation_type = operation.get("type", "sql") 7686 if operation_type == "python": 7687 self.calculation_process_function( 7688 operation=operation, operation_name=operation_name 7689 ) 7690 elif operation_type == "sql": 7691 self.calculation_process_sql( 7692 operation=operation, operation_name=operation_name 7693 ) 7694 else: 7695 log.error( 7696 f"Operations config: Type '{operation_type}' NOT available" 7697 ) 7698 raise ValueError( 7699 f"Operations config: Type '{operation_type}' NOT available" 7700 ) 7701 else: 7702 log.error( 7703 f"Operations config: Calculation '{operation_name}' NOT available" 7704 ) 7705 raise ValueError( 7706 f"Operations config: Calculation '{operation_name}' NOT available" 7707 ) 7708 7709 # Explode INFOS fields into table fields 7710 if self.get_explode_infos(): 7711 self.explode_infos( 7712 prefix=self.get_explode_infos_prefix(), 7713 fields=self.get_explode_infos_fields(), 7714 force=True, 7715 )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle" : null }
7717 def calculation_process_sql( 7718 self, operation: dict, operation_name: str = "unknown" 7719 ) -> None: 7720 """ 7721 The `calculation_process_sql` function takes in a mathematical operation as a string and 7722 performs the operation, updating the specified table with the result. 7723 7724 :param operation: The `operation` parameter is a dictionary that contains information about the 7725 mathematical operation to be performed. It includes the following keys: 7726 :type operation: dict 7727 :param operation_name: The `operation_name` parameter is a string that represents the name of 7728 the mathematical operation being performed. It is used for logging and error handling purposes, 7729 defaults to unknown 7730 :type operation_name: str (optional) 7731 """ 7732 7733 # table variants 7734 table_variants = self.get_table_variants(clause="alter") 7735 7736 # Operation infos 7737 operation_name = operation.get("name", "unknown") 7738 log.debug(f"process sql {operation_name}") 7739 output_column_name = operation.get("output_column_name", operation_name) 7740 output_column_type = operation.get("output_column_type", "String") 7741 prefix = operation.get("explode_infos_prefix", "") 7742 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7743 output_column_description = operation.get( 7744 "output_column_description", f"{operation_name} operation" 7745 ) 7746 operation_query = operation.get("operation_query", None) 7747 if isinstance(operation_query, list): 7748 operation_query = " ".join(operation_query) 7749 operation_info_fields = operation.get("info_fields", []) 7750 operation_info_fields_check = operation.get("info_fields_check", False) 7751 operation_info = operation.get("operation_info", True) 7752 7753 if operation_query: 7754 7755 # Info fields check 7756 operation_info_fields_check_result = True 7757 if operation_info_fields_check: 7758 header_infos = self.get_header().infos 7759 for info_field in operation_info_fields: 7760 
operation_info_fields_check_result = ( 7761 operation_info_fields_check_result 7762 and info_field in header_infos 7763 ) 7764 7765 # If info fields available 7766 if operation_info_fields_check_result: 7767 7768 # Added_columns 7769 added_columns = [] 7770 7771 # Create VCF header field 7772 vcf_reader = self.get_header() 7773 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7774 output_column_name, 7775 ".", 7776 output_column_type, 7777 output_column_description, 7778 "howard calculation", 7779 "0", 7780 self.code_type_map.get(output_column_type), 7781 ) 7782 7783 # Explode infos if needed 7784 log.debug(f"calculation_process_sql prefix {prefix}") 7785 added_columns += self.explode_infos( 7786 prefix=prefix, 7787 fields=[output_column_name] + operation_info_fields, 7788 force=True, 7789 ) 7790 7791 # Create column 7792 added_column = self.add_column( 7793 table_name=table_variants, 7794 column_name=prefix + output_column_name, 7795 column_type=output_column_type_sql, 7796 default_value="null", 7797 ) 7798 added_columns.append(added_column) 7799 7800 # Operation calculation 7801 try: 7802 7803 # Query to update calculation column 7804 sql_update = f""" 7805 UPDATE {table_variants} 7806 SET "{prefix}{output_column_name}" = ({operation_query}) 7807 """ 7808 self.conn.execute(sql_update) 7809 7810 # Add to INFO 7811 if operation_info: 7812 sql_update_info = f""" 7813 UPDATE {table_variants} 7814 SET "INFO" = 7815 concat( 7816 CASE 7817 WHEN "INFO" IS NOT NULL 7818 THEN concat("INFO", ';') 7819 ELSE '' 7820 END, 7821 '{output_column_name}=', 7822 "{prefix}{output_column_name}" 7823 ) 7824 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7825 """ 7826 self.conn.execute(sql_update_info) 7827 7828 except: 7829 log.error( 7830 f"Operations config: Calculation '{operation_name}' query failed" 7831 ) 7832 raise ValueError( 7833 f"Operations config: Calculation '{operation_name}' query failed" 7834 ) 7835 7836 # Remove 
added columns 7837 for added_column in added_columns: 7838 log.debug(f"added_column: {added_column}") 7839 self.drop_column(column=added_column) 7840 7841 else: 7842 log.error( 7843 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7844 ) 7845 raise ValueError( 7846 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7847 ) 7848 7849 else: 7850 log.error( 7851 f"Operations config: Calculation '{operation_name}' query NOT defined" 7852 ) 7853 raise ValueError( 7854 f"Operations config: Calculation '{operation_name}' query NOT defined" 7855 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
7857 def calculation_process_function( 7858 self, operation: dict, operation_name: str = "unknown" 7859 ) -> None: 7860 """ 7861 The `calculation_process_function` takes in an operation dictionary and performs the specified 7862 function with the given parameters. 7863 7864 :param operation: The `operation` parameter is a dictionary that contains information about the 7865 operation to be performed. It has the following keys: 7866 :type operation: dict 7867 :param operation_name: The `operation_name` parameter is a string that represents the name of 7868 the operation being performed. It is used for logging purposes, defaults to unknown 7869 :type operation_name: str (optional) 7870 """ 7871 7872 operation_name = operation["name"] 7873 log.debug(f"process sql {operation_name}") 7874 function_name = operation["function_name"] 7875 function_params = operation["function_params"] 7876 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
7878 def calculation_variant_id(self) -> None: 7879 """ 7880 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7881 updates the INFO field of a variants table with the variant ID. 7882 """ 7883 7884 # variant_id annotation field 7885 variant_id_tag = self.get_variant_id_column() 7886 added_columns = [variant_id_tag] 7887 7888 # variant_id hgvs tags" 7889 vcf_infos_tags = { 7890 variant_id_tag: "howard variant ID annotation", 7891 } 7892 7893 # Variants table 7894 table_variants = self.get_table_variants() 7895 7896 # Header 7897 vcf_reader = self.get_header() 7898 7899 # Add variant_id to header 7900 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7901 variant_id_tag, 7902 ".", 7903 "String", 7904 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7905 "howard calculation", 7906 "0", 7907 self.code_type_map.get("String"), 7908 ) 7909 7910 # Update 7911 sql_update = f""" 7912 UPDATE {table_variants} 7913 SET "INFO" = 7914 concat( 7915 CASE 7916 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7917 THEN '' 7918 ELSE concat("INFO", ';') 7919 END, 7920 '{variant_id_tag}=', 7921 "{variant_id_tag}" 7922 ) 7923 """ 7924 self.conn.execute(sql_update) 7925 7926 # Remove added columns 7927 for added_column in added_columns: 7928 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the SnpEff annotation field of a VCF
        and append them to the INFO field as '<snpeff_hgvs>=<value>'.

        Does nothing (beyond a warning) when `snpeff_field` is not declared
        in the VCF header, i.e. the file was not annotated with snpEff.

        :param snpeff_hgvs: name of the INFO tag that will store the HGVS
            nomenclatures extracted from the SnpEff annotation field,
            defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: INFO field that contains the SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description cannot be parsed
        """

        # Description used for the INFO header entry
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix of the exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" here
        # (the configured value itself is discarded) — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the ANN source and the HGVS result
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Explode the SnpEff annotation field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Parse the ANN sub-field names from the quoted part of the
            # header description (snpEff format: "... 'A | B | C ...'")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # Map of alphanumeric-only sub-field key -> original label
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column, used as the join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant IDs and ANN values into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclatures from each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the snpeff_hgvs INFO field in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO for non-empty extracted values,
            # joining on the variant ID
            # NOTE(review): the UPDATE target is the hard-coded 'variants'
            # table while the WHERE clause uses {table_variants} — confirm
            # these always refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe explicitly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains the SnpEff annotations from which HGVS nomenclatures are extracted; defaults to "ANN".
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode SnpEff annotations into separate INFO tags (or one JSON tag)
        and append them to the INFO field.

        Does nothing (beyond a warning) when `snpeff_field` is not declared
        in the VCF header, i.e. the file was not annotated with snpEff.

        :param uniquify: whether duplicate annotation values should be
            removed from the output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: output layout — "fields" produces one INFO tag
            per ANN sub-field, "JSON" a single JSON-valued tag, defaults to
            fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated INFO tags to
            distinguish them from existing annotations, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: INFO field that contains the SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description cannot be parsed
        """

        # Internal name of the working column holding the exploded result
        snpeff_hgvs = "snpeff_ann_explode"

        # Description used for the INFO header entries
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of the exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" here
        # (the configured value itself is discarded) — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the ANN source and the exploded result
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Explode the SnpEff annotation field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Parse the ANN sub-field names from the quoted part of the
            # header description (snpEff format: "... 'A | B | C ...'")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # ann_header: alphanumeric-only keys (used as tag suffixes);
                # ann_header_desc: key -> original label
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column, used as the join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant IDs and ANN values into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each ANN value into the requested output format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the generated INFO field(s) in the VCF header:
            # one JSON-valued tag, or one tag per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # JSON mode: the exploded value is the JSON payload, so the
                # '<tag>=' prefix is prepended in the UPDATE below
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO for non-empty values,
            # joining on the variant ID
            # NOTE(review): the UPDATE target is the hard-coded 'variants'
            # table while the WHERE clause uses {table_variants} — confirm
            # these always refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe explicitly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified, i.e. whether duplicate entries should be removed; defaults to True.
- output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated. It has a default value of "fields"; it can also be set to "JSON" to output the annotations in JSON format.
- output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` method specifies the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones; defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` function specifies the field in the VCF file that contains the SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly; defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        Extract HGVS nomenclature (NOMEN and related fields) from an exploded HGVS
        INFO field and append the resulting tags to the ``INFO`` column.

        Reads the HGVS source field and optional transcripts-of-preference file from
        ``param["calculation"]["calculations"]["NOMEN"]["options"]``, computes the
        NOMEN structure per variant with ``find_nomen``, registers one VCF INFO
        header entry per NOMEN sub-field, and updates the variants table via SQL.
        Raises ``ValueError`` if the configured transcripts file does not exist.
        """

        # Name of the intermediate dataframe column holding the full NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN sub-fields and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Calculation parameters
        param = self.get_param()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object (its .infos dict is mutated below)
        vcf_reader = self.get_header()

        # INFO field containing the HGVS list (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional transcripts-of-preference file; first column = transcript ids
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added during this calculation; dropped at the end
        added_columns = []

        # Materialize the HGVS INFO field as a real column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Proceed only if the exploded column is actually available
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys + exploded HGVS column into pandas
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN dict per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode the NOMEN structure into one column per sub-field and
            # build the SQL fragments that append each tag to INFO
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # One dataframe column per NOMEN sub-field
                # (apply runs inside the loop, so the lambda's nomen_field is current)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Register the matching VCF INFO header entry
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each fragment prepends ';<FIELD>=' when the value is non-empty.
                # NOTE(review): when INFO is '' or '.', the appended text still
                # starts with ';' (siblings strip '.'/'' first) — confirm intended.
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Combine fragments into the SET expression
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN tags to INFO, joining on the variant key columns.
            # `FROM dataframe_hgvs` presumably resolves the pandas local via
            # duckdb's replacement scan — do not rename the variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Free the dataframe eagerly (can be large)
            del dataframe_hgvs
            gc.collect()

        # Drop the temporary columns created above
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute, per variant, the number of pipelines/samples in which the variant
        is found, and append the result to the ``INFO`` column as ``<tag>=...``.

        Does nothing unless the table has a ``FORMAT`` column and at least one
        sample. The per-row value is computed by the ``findbypipeline`` helper
        over all sample columns; a matching INFO header entry is registered.

        :param tag: Name of the annotation tag written to the VCF header and to
            the ``INFO`` field, defaults to "findbypipeline"
        :type tag: str (optional)
        """

        # Only meaningful for genotype-bearing tables (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Annotation tag name
            findbypipeline_tag = tag

            # Description for the VCF header entry added below
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # Dataframe column receiving the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its .infos dict is mutated below)
            vcf_reader = self.get_header()

            # Technical variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT, and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Register the INFO header entry for the new tag
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id.
            # `FROM dataframe_findbypipeline` presumably resolves the pandas
            # local via duckdb's replacement scan — do not rename the variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe eagerly (can be large)
            del dataframe_findbypipeline
            gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
    def calculation_genotype_concordance(self) -> None:
        """
        Compute, per variant, the genotype concordance across callers/samples of a
        multi-caller VCF, and append it to the ``INFO`` column as
        ``genotypeconcordance=...``.

        Does nothing unless the table has a ``FORMAT`` column and at least one
        sample. The per-row value is computed by the ``genotypeconcordance``
        helper over all sample columns; a matching INFO header entry is registered.
        """

        # Only meaningful for genotype-bearing tables (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Annotation tag name
            genotypeconcordance_tag = "genotypeconcordance"

            # Description for the VCF header entry added below
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # Dataframe column receiving the computed value
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its .infos dict is mutated below)
            vcf_reader = self.get_header()

            # Technical variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT, and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the concordance value row by row
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Register the INFO header entry for the new tag.
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # is a copy-paste leftover; it is never used since the key is
            # always present in vcf_infos_tags.
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id.
            # `FROM dataframe_genotypeconcordance` presumably resolves the pandas
            # local via duckdb's replacement scan — do not rename the variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe eagerly (can be large)
            del dataframe_genotypeconcordance
            gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
8591 def calculation_barcode(self, tag: str = "barcode") -> None: 8592 """ 8593 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8594 updates the INFO field in the file with the calculated barcode values. 8595 8596 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8597 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 8598 the default tag name is set to "barcode", defaults to barcode 8599 :type tag: str (optional) 8600 """ 8601 8602 # if FORMAT and samples 8603 if ( 8604 "FORMAT" in self.get_header_columns_as_list() 8605 and self.get_header_sample_list() 8606 ): 8607 8608 # barcode annotation field 8609 if not tag: 8610 tag = "barcode" 8611 8612 # VCF infos tags 8613 vcf_infos_tags = { 8614 tag: "barcode calculation (VaRank)", 8615 } 8616 8617 # Prefix 8618 prefix = self.get_explode_infos_prefix() 8619 8620 # Field 8621 barcode_infos = prefix + tag 8622 8623 # Variants table 8624 table_variants = self.get_table_variants() 8625 8626 # Header 8627 vcf_reader = self.get_header() 8628 8629 # Create variant id 8630 variant_id_column = self.get_variant_id_column() 8631 added_columns = [variant_id_column] 8632 8633 # variant_id, FORMAT and samples 8634 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8635 self.get_header_sample_list() 8636 ) 8637 8638 # Create dataframe 8639 dataframe_barcode = self.get_query_to_df( 8640 f""" SELECT {samples_fields} FROM {table_variants} """ 8641 ) 8642 8643 # Create barcode column 8644 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8645 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8646 ) 8647 8648 # Add barcode to header 8649 vcf_reader.infos[tag] = vcf.parser._Info( 8650 tag, 8651 ".", 8652 "String", 8653 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8654 "howard calculation", 8655 "0", 8656 self.code_type_map.get("String"), 8657 ) 8658 8659 # 
Update 8660 sql_update = f""" 8661 UPDATE {table_variants} 8662 SET "INFO" = 8663 concat( 8664 CASE 8665 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8666 THEN '' 8667 ELSE concat("INFO", ';') 8668 END, 8669 CASE 8670 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8671 AND dataframe_barcode."{barcode_infos}" NOT NULL 8672 THEN concat( 8673 '{tag}=', 8674 dataframe_barcode."{barcode_infos}" 8675 ) 8676 ELSE '' 8677 END 8678 ) 8679 FROM dataframe_barcode 8680 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8681 """ 8682 self.conn.execute(sql_update) 8683 8684 # Remove added columns 8685 for added_column in added_columns: 8686 self.drop_column(column=added_column) 8687 8688 # Delete dataframe 8689 del dataframe_barcode 8690 gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name "barcode" is used.
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode per variant over a pedigree of samples and append
        it, plus the list of pedigree samples, to each genotype column as two extra
        FORMAT sub-fields (``<tag>`` and ``<tag>S``).

        The pedigree comes from
        ``param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"]``
        and may be a JSON file path, a JSON string, a comma-separated sample list,
        or a dict; when absent, all samples are used. Samples outside the pedigree
        get ``.`` values. Does nothing unless the table has a ``FORMAT`` column and
        at least one sample.

        :param tag: FORMAT tag written for the family barcode; falsy values fall
            back to "BCF", defaults to "BCF"
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no samples
        """

        # Only meaningful for genotype-bearing tables (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag when an empty/None tag is passed
            if not tag:
                tag = "BCF"

            # Descriptions for the two FORMAT header entries added below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Calculation parameters
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # Pedigree configuration (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Case 1: path to a JSON pedigree file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Case 2: a string — JSON first, else comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Case 3: already a dict — use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Anything else is an error
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Flat list of pedigree sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: treat every sample as its own member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # The pedigree must resolve to at least one sample
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved pedigree
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column receiving the computed barcode
            barcode_infos = prefix + tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its .formats dict is mutated below)
            vcf_reader = self.get_header()

            # Technical variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT, and the pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row over the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Register the two FORMAT header entries: <tag> and <tag>S
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET fragment per column:
            # - pedigree sample  -> append :<barcode>:<sample list>
            # - FORMAT column    -> append :<tag>:<tag>S (declares the sub-fields)
            # - other samples    -> append :.:.
            # './.' genotypes are first padded with '.' per existing FORMAT key
            # so field counts stay aligned.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Characters stripped from FORMAT to count its separators
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all per-column updates in one statement, joining on the
            # variant id. `FROM dataframe_barcode` presumably resolves the pandas
            # local via duckdb's replacement scan — do not rename the variable.
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe eagerly (can be large)
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value "BCF" is used.
    def calculation_trio(self) -> None:
        """
        Compute a trio (father/mother/child) annotation per variant from the sample
        genotypes and append it to the ``INFO`` column as ``trio=...``.

        The trio pedigree comes from
        ``param["calculation"]["calculations"]["TRIO"]["trio_pedigree"]`` and may be
        a JSON file path, a JSON string, a comma-separated list of exactly three
        samples, or a dict with "father"/"mother"/"child" keys; when absent, the
        first three samples are used. Does nothing unless the table has a
        ``FORMAT`` column and at least one sample.

        :raises ValueError: if the pedigree is malformed or fewer than three
            samples are available
        """

        # Only meaningful for genotype-bearing tables (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Annotation tag name
            trio_tag = "trio"

            # Description for the VCF header entry added below
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Calculation parameters
            param = self.get_param()

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # Trio pedigree configuration
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Normalize the pedigree into {"father":…, "mother":…, "child":…}
            if trio_ped:

                # Case 1: path to a JSON pedigree file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Case 2: a string — JSON first, else "father,mother,child"
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Case 3: already a dict — use as-is
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Anything else is an error
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Ordered sample list: father, mother, child
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: take the first three samples
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # The pedigree must have exactly three members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved trio
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Dataframe column receiving the computed value
            trio_infos = prefix + trio_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its .infos dict is mutated below)
            vcf_reader = self.get_header()

            # Technical variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT, and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio value row by row over the trio samples
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Register the INFO header entry for the new tag.
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # is a copy-paste leftover; it is never used since the key is
            # always present in vcf_infos_tags.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append 'trio=<value>' to INFO, joining on the variant id.
            # `FROM dataframe_trio` presumably resolves the pandas local via
            # duckdb's replacement scan — do not rename the variable.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                                AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe eagerly (can be large)
            del dataframe_trio
            gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9061 def calculation_vaf_normalization(self) -> None: 9062 """ 9063 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9064 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9065 :return: The function does not return anything. 9066 """ 9067 9068 # if FORMAT and samples 9069 if ( 9070 "FORMAT" in self.get_header_columns_as_list() 9071 and self.get_header_sample_list() 9072 ): 9073 9074 # vaf_normalization annotation field 9075 vaf_normalization_tag = "VAF" 9076 9077 # VCF infos tags 9078 vcf_infos_tags = { 9079 "VAF": "VAF Variant Frequency", 9080 } 9081 9082 # Prefix 9083 prefix = self.get_explode_infos_prefix() 9084 9085 # Variants table 9086 table_variants = self.get_table_variants() 9087 9088 # Header 9089 vcf_reader = self.get_header() 9090 9091 # Do not calculate if VAF already exists 9092 if "VAF" in vcf_reader.formats: 9093 log.debug("VAF already on genotypes") 9094 return 9095 9096 # Create variant id 9097 variant_id_column = self.get_variant_id_column() 9098 added_columns = [variant_id_column] 9099 9100 # variant_id, FORMAT and samples 9101 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9102 f""" "{sample}" """ for sample in self.get_header_sample_list() 9103 ) 9104 9105 # Create dataframe 9106 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9107 log.debug(f"query={query}") 9108 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9109 9110 vaf_normalization_set = [] 9111 9112 # for each sample vaf_normalization 9113 for sample in self.get_header_sample_list(): 9114 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9115 lambda row: vaf_normalization(row, sample=sample), axis=1 9116 ) 9117 vaf_normalization_set.append( 9118 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9119 ) 9120 9121 # Add VAF to FORMAT 9122 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9123 "FORMAT" 9124 ].apply(lambda x: str(x) + ":VAF") 9125 vaf_normalization_set.append( 9126 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9127 ) 9128 9129 # Add vaf_normalization to header 9130 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9131 id=vaf_normalization_tag, 9132 num="1", 9133 type="Float", 9134 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9135 type_code=self.code_type_map.get("Float"), 9136 ) 9137 9138 # Create fields to add in INFO 9139 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9140 9141 # Update 9142 sql_update = f""" 9143 UPDATE {table_variants} 9144 SET {sql_vaf_normalization_set} 9145 FROM dataframe_vaf_normalization 9146 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9147 9148 """ 9149 self.conn.execute(sql_update) 9150 9151 # Remove added columns 9152 for added_column in added_columns: 9153 self.drop_column(column=added_column) 9154 9155 # Delete dataframe 9156 del dataframe_vaf_normalization 9157 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics (count, list, min, max, mean, median,
        standard deviation) of a genotype sub-field across all samples, and append
        them to the ``INFO`` column as ``<info>_stats_*`` tags.

        Does nothing unless the table has a ``FORMAT`` column and at least one
        sample. The per-row statistics dict is computed by the ``genotype_stats``
        helper; one INFO header entry is registered per statistic.

        :param info: Name of the genotype sub-field to summarize (also used to
            name the generated ``_stats_*`` tags), defaults to "VAF"
        :type info: str (optional)
        """

        # Only meaningful for genotype-bearing tables (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag; individual statistics are '<info>_stats_<stat>'
            vaf_stats_tag = info + "_stats"

            # Descriptions for the per-statistic INFO header entries
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the full statistics dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its .infos dict is mutated below)
            vcf_reader = self.get_header()

            # Technical variant id column used as the UPDATE join key
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT, and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics dict row by row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic
            sql_vaf_stats_fields = []

            # For each statistic: extract it into its own column, register the
            # header entry, and build the SQL fragment appending '<stat>=<value>'
            for stat in vcf_infos_tags:

                # One dataframe column per statistic
                # (apply runs inside the loop, so the lambda's stat is current)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the INFO header entry for this statistic
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: none before the first statistic, ';' otherwise
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment appending '<sep><stat>=<value>' when the value is set
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Combine fragments into the SET expression
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append all statistics to INFO, joining on the variant id.
            # `FROM dataframe_vaf_stats` presumably resolves the pandas local via
            # duckdb's replacement scan — do not rename the variable.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe eagerly (can be large)
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- `info`: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median. Defaults to `VAF`.
9297 def calculation_transcripts_json(self, info: str = "transcripts_json") -> None: 9298 """ 9299 The function `calculation_transcripts_json` creates a transcripts table and adds an info field 9300 to it if transcripts are available. 9301 9302 :param info: The `info` parameter in the `calculation_transcripts_json` method is a string 9303 parameter that specifies the information field to be used in the transcripts JSON. It has a 9304 default value of "transcripts_json" if no value is provided when calling the method, defaults to 9305 transcripts_json 9306 :type info: str (optional) 9307 """ 9308 9309 # Create transcripts table 9310 transcripts_table = self.create_transcript_view() 9311 9312 # Add info field 9313 if transcripts_table: 9314 self.transcript_view_to_variants( 9315 transcripts_table=transcripts_table, transcripts_info_field=info 9316 ) 9317 else: 9318 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_json creates a transcripts table and adds an info field
to it if transcripts are available.
Parameters
- `info`: The `info` parameter in the `calculation_transcripts_json` method is a string that specifies the information field to be used in the transcripts JSON. It has a default value of `"transcripts_json"` if no value is provided when calling the method.
9320 def calculation_transcripts_prioritization(self) -> None: 9321 """ 9322 The function `calculation_transcripts_prioritization` creates a transcripts table and 9323 prioritizes transcripts based on certain criteria. 9324 """ 9325 9326 # Create transcripts table 9327 transcripts_table = self.create_transcript_view() 9328 9329 # Add info field 9330 if transcripts_table: 9331 self.transcripts_prioritization(transcripts_table=transcripts_table) 9332 else: 9333 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
def transcripts_prioritization(
    self, transcripts_table: str = None, param: dict = {}
) -> bool:
    """
    Prioritize transcripts and export the best transcript per variant to the
    variants table INFO column.

    The transcripts table is prioritized with the generic prioritization
    engine (restricted to the PZ Score/Flag fields), then for each variant the
    best-ranked transcript (flag ASC, score DESC, transcript ASC) is appended
    to INFO as <pzprefix>Transcript/<pzprefix>Score/<pzprefix>Flag, and the
    new Transcript tag is declared in the VCF header.

    :param transcripts_table: name of the table containing transcripts data;
        when None, it is created with `create_transcript_view` (default table
        name "transcripts")
    :type transcripts_table: str
    :param param: configuration dictionary for the prioritization process
        (pzprefix, profiles, ...); when empty, `self.get_param()` is used
    :type param: dict
    :return: True when the prioritization completed, False when no profile is
        defined or the prioritization step did not run
    :raises ValueError: when no transcripts table is available
    """

    log.debug("Start transcripts prioritization...")

    # Param
    if not param:
        param = self.get_param()

    # Variants table
    table_variants = self.get_table_variants()

    # Transcripts table: create it from param when not provided
    if transcripts_table is None:
        transcripts_table = self.create_transcript_view(
            transcripts_table="transcripts", param=param
        )
    log.debug(f"transcripts_table={transcripts_table}")
    if transcripts_table is None:
        # Typo fix: was "availalble"
        msg_err = "No Transcripts table available"
        log.error(msg_err)
        raise ValueError(msg_err)

    # Get transcripts columns
    columns_as_list_query = f"""
        DESCRIBE {transcripts_table}
    """
    columns_as_list = list(
        self.get_query_to_df(columns_as_list_query)["column_name"]
    )

    # Create INFO column if not exists (prioritization writes into it)
    if "INFO" not in columns_as_list:
        query_add_info = f"""
            ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
        """
        self.execute_query(query_add_info)

    # Prioritization param, forcing only PZ Score and Flag fields.
    # NOTE: pz_param aliases the nested dict inside `param`, so setting
    # "pzfields" here is deliberately visible to the later
    # param.get("transcripts").get("prioritization") lookups.
    pz_param = param.get("transcripts", {}).get("prioritization", {})
    pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
    pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
    pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
    pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
    pz_profile_default = (
        param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
    )

    # Exit if no profile
    if pz_profile_default is None:
        log.warning("No profile defined for transcripts prioritization")
        return False

    # Prioritization
    prioritization_result = self.prioritization(
        table=transcripts_table,
        pz_param=param.get("transcripts", {}).get("prioritization", {}),
    )
    if not prioritization_result:
        log.warning("Transcripts prioritization not processed")
        return False

    # Explode PZ fields into proper columns of the transcripts table
    self.explode_infos(
        table=transcripts_table,
        fields=param.get("transcripts", {})
        .get("prioritization", {})
        .get("pzfields", []),
    )

    # Export transcripts prioritization infos to variants table: keep the
    # best-ranked transcript per variant and append its PZ fields to INFO.
    # Fix: the WHERE clause previously hard-coded the "variants" table name
    # instead of using {table_variants}.
    query_update = f"""
        WITH RankedTranscripts AS (
            SELECT
                "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
                ROW_NUMBER() OVER (
                    PARTITION BY "#CHROM", POS, REF, ALT
                    ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
                ) AS rn
            FROM
                {transcripts_table}
        )
        UPDATE {table_variants}
        SET
            INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
            )
        FROM
            RankedTranscripts
        WHERE
            rn = 1
            AND {table_variants}."#CHROM" = RankedTranscripts."#CHROM"
            AND {table_variants}."POS" = RankedTranscripts."POS"
            AND {table_variants}."REF" = RankedTranscripts."REF"
            AND {table_variants}."ALT" = RankedTranscripts."ALT"

    """
    self.execute_query(query=query_update)

    # Declare the PZ Transcript tag in the VCF header
    self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
        pz_fields_transcripts,
        ".",
        "String",
        f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
        "unknown",
        "unknown",
        code_type_map["String"],
    )

    # Return
    return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- `transcripts_table`: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- `param`: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields and the default profiles.
Returns
The `transcripts_prioritization` function returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
def create_transcript_view_from_columns_map(
    self,
    transcripts_table: str = "transcripts",
    columns_maps: list = None,
    added_columns: list = None,
    temporary_tables: list = None,
    annotation_fields: list = None,
) -> tuple[list, list, list]:
    """
    Generate temporary tables holding exploded transcripts data, one per
    columns-map configuration.

    Each columns map names a main transcript column plus extra info columns;
    those comma-separated columns are exploded (one row per transcript) into a
    randomly-suffixed temporary table based on `transcripts_table`.

    Example of a columns map entry::

        {
            "transcripts_column": "Ensembl_transcriptid",
            "transcripts_infos_columns": ["genename", "LIST_S2_score"],
        }

    :param transcripts_table: base name used as prefix for the temporary
        tables, defaults to "transcripts"
    :type transcripts_table: str (optional)
    :param columns_maps: list of mapping configurations; each entry is a dict
        with "transcripts_column" and "transcripts_infos_columns" keys
        (was previously mis-annotated as dict)
    :type columns_maps: list
    :param added_columns: list collecting the columns added to the variants
        table by exploding INFO fields; mutated in place and returned.
        Fix: the default is now None instead of a shared mutable [] that
        leaked state across calls.
    :type added_columns: list
    :param temporary_tables: list collecting the names of the temporary
        tables created; mutated in place and returned
    :type temporary_tables: list
    :param annotation_fields: list collecting the annotation field names
        (info columns, excluding the transcript column); mutated in place and
        returned
    :type annotation_fields: list
    :return: tuple of (added_columns, temporary_tables, annotation_fields)
    """

    log.debug("Start transcripts view creation from columns map...")

    # Init (None defaults avoid the shared-mutable-default pitfall)
    if columns_maps is None:
        columns_maps = []
    if added_columns is None:
        added_columns = []
    if temporary_tables is None:
        temporary_tables = []
    if annotation_fields is None:
        annotation_fields = []

    # Variants table
    table_variants = self.get_table_variants()

    for columns_map in columns_maps:

        # Transcript column
        transcripts_column = columns_map.get("transcripts_column", None)

        # Transcripts infos columns
        transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

        if transcripts_column is not None:

            # Explode the transcript and info columns out of INFO
            added_columns += self.explode_infos(
                fields=[transcripts_column] + transcripts_infos_columns
            )

            # View clauses: split each comma-separated column into rows.
            # NOTE(review): clause_select is applied in both the inner and
            # outer SELECT; the outer split re-splits already-exploded single
            # values — confirm whether this double split is intended.
            clause_select = []
            for field in [transcripts_column] + transcripts_infos_columns:
                clause_select.append(
                    f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                )
                if field not in [transcripts_column]:
                    annotation_fields.append(field)

            # Query View
            query = f"""
                SELECT
                    "#CHROM", POS, REF, ALT,
                    "{transcripts_column}" AS 'transcript',
                    {", ".join(clause_select)}
                FROM (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        {", ".join(clause_select)}
                    FROM {table_variants}
                    )
                WHERE "{transcripts_column}" IS NOT NULL
            """

            # Temporary table with a random suffix to avoid collisions
            temporary_table = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Temporary_tables
            temporary_tables.append(temporary_table)
            query_view = f"""
                CREATE TEMPORARY TABLE {temporary_table}
                AS ({query})
            """
            self.execute_query(query=query_view)

    return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- `transcripts_table`: The `transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, and predictions. Defaults to "transcripts".
- `columns_maps`: The `columns_maps` parameter is a list of mapping configurations describing how to map columns from a transcripts table to create a view. Each entry represents a mapping configuration for a specific set of columns, typically including the main transcript column and additional information columns.
- `added_columns`: The `added_columns` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the additional columns added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column.
- `temporary_tables`: The `temporary_tables` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated.
- `annotation_fields`: The `annotation_fields` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the fields used for annotation in the query view creation process. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the `columns_maps` entries.
Returns
The `create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
def create_transcript_view_from_column_format(
    self,
    transcripts_table: str = "transcripts",
    column_formats: list = None,
    temporary_tables: list = None,
    annotation_fields: list = None,
) -> tuple[list, list]:
    """
    Generate temporary views of transcripts data from annotation-format
    columns (e.g. snpEff "ANN"), and collect their annotation fields.

    Example of a column format entry::

        {
            "transcripts_column": "ANN",
            "transcripts_infos_column": "Feature_ID",
        }

    :param transcripts_table: base name used as prefix for the temporary
        views, defaults to "transcripts"
    :type transcripts_table: str (optional)
    :param column_formats: list of format configurations; each entry is a
        dict with "transcripts_column" (annotation field, default "ANN") and
        "transcripts_infos_column" (transcript id sub-field, default
        "Feature_ID") keys (was previously mis-annotated as dict)
    :type column_formats: list
    :param temporary_tables: list collecting the names of the temporary views
        created; mutated in place and returned
    :type temporary_tables: list
    :param annotation_fields: list collecting the annotation column names
        found in the temporary views (excluding #CHROM/POS/REF/ALT); mutated
        in place and returned
    :type annotation_fields: list
    :return: tuple of (temporary_tables, annotation_fields)
    """

    log.debug("Start transcripts view creation from column format...")

    # Init (None defaults avoid the shared-mutable-default pitfall)
    if column_formats is None:
        column_formats = []
    if temporary_tables is None:
        temporary_tables = []
    if annotation_fields is None:
        annotation_fields = []

    for column_format in column_formats:

        # annotation field and transcript annotation field
        annotation_field = column_format.get("transcripts_column", "ANN")
        transcript_annotation = column_format.get(
            "transcripts_infos_column", "Feature_ID"
        )

        # Temporary view name with a random suffix to avoid collisions
        temporary_view_name = transcripts_table + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )

        # Explode the annotation field into a table (returns None when the
        # field is absent from the header)
        temporary_view_name = self.annotation_format_to_table(
            uniquify=True,
            annotation_field=annotation_field,
            view_name=temporary_view_name,
            annotation_id=transcript_annotation,
        )

        # Collect the annotation columns of the created view
        if temporary_view_name:
            query_annotation_fields = f"""
                SELECT *
                FROM (
                    DESCRIBE SELECT *
                    FROM {temporary_view_name}
                    )
                WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
            """
            df_annotation_fields = self.get_query_to_df(
                query=query_annotation_fields
            )

            # Add temporary view and annotation fields
            temporary_tables.append(temporary_view_name)
            annotation_fields += list(set(df_annotation_fields["column_name"]))

    return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- `transcripts_table`: The `transcripts_table` parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but a different table name may be provided if needed.
- `column_formats`: The `column_formats` parameter is a list of configurations describing the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column.
- `temporary_tables`: The `temporary_tables` parameter in the `create_transcript_view_from_column_format` function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
- `annotation_fields`: The `annotation_fields` parameter in the `create_transcript_view_from_column_format` function is a list that stores the annotation fields extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns such as `#CHROM`, `POS`, `REF` and `ALT`.
Returns
The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
def create_transcript_view(
    self,
    transcripts_table: str = None,
    transcripts_table_drop: bool = True,
    param: dict = {},
) -> str:
    """
    Generate the transcripts table by exploding transcripts data from the
    variants table according to the "transcripts.struct" configuration.

    Both structure styles are processed ("from_columns_map" and
    "from_column_format"), each producing temporary tables that are merged
    (UNION BY NAME) and aggregated per transcript into the final table.

    :param transcripts_table: name of the table that will store the final
        transcript view; when None, taken from param ("transcripts.table",
        default "transcripts")
    :type transcripts_table: str (optional)
    :param transcripts_table_drop: whether to drop an existing transcripts
        table before creating the new one, defaults to True
    :type transcripts_table_drop: bool (optional)
    :param param: configuration dictionary describing the transcripts
        structure; when empty, `self.get_param()` is used
    :type param: dict
    :return: the name of the transcripts table created, or None when no
        "transcripts.struct" configuration is present
    """

    log.debug("Start transcripts view creation...")

    # Default table name when neither argument nor param provides one
    transcripts_table_default = "transcripts"

    # Param
    if not param:
        param = self.get_param()

    # Struct: describes how transcripts are encoded in the variants table
    struct = param.get("transcripts", {}).get("struct", None)

    if struct:

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Columns added to the variants table (dropped at the end)
        added_columns = []

        # Temporary tables to merge into the final view
        temporary_tables = []

        # Annotation fields aggregated per transcript
        annotation_fields = []

        # From columns map.
        # NOTE(review): the helper mutates these lists in place AND returns
        # them, so the `+=` below duplicates every entry — confirm whether
        # intended (duplicates appear harmless downstream but are wasteful).
        columns_maps = struct.get("from_columns_map", [])
        added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
            self.create_transcript_view_from_columns_map(
                transcripts_table=transcripts_table,
                columns_maps=columns_maps,
                added_columns=added_columns,
                temporary_tables=temporary_tables,
                annotation_fields=annotation_fields,
            )
        )
        added_columns += added_columns_tmp
        temporary_tables += temporary_tables_tmp
        annotation_fields += annotation_fields_tmp

        # From column format (same in-place/return duplication as above)
        column_formats = struct.get("from_column_format", [])
        temporary_tables_tmp, annotation_fields_tmp = (
            self.create_transcript_view_from_column_format(
                transcripts_table=transcripts_table,
                column_formats=column_formats,
                temporary_tables=temporary_tables,
                annotation_fields=annotation_fields,
            )
        )
        temporary_tables += temporary_tables_tmp
        annotation_fields += annotation_fields_tmp

        # Merge temporary tables query
        query_merge = ""
        for temporary_table in temporary_tables:

            # First temporary table
            if not query_merge:
                query_merge = f"""
                    SELECT * FROM {temporary_table}
                """
            # other temporary table (using UNION)
            else:
                query_merge += f"""
                    UNION BY NAME SELECT * FROM {temporary_table}
                """

        # Merge on transcript
        query_merge_on_transcripts_annotation_fields = []
        # Aggregate all annotations fields (distinct values joined by ',')
        for annotation_field in set(annotation_fields):
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
            )
        # Query for transcripts view: one row per variant/transcript pair
        query_merge_on_transcripts = f"""
            SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
            FROM ({query_merge})
            GROUP BY "#CHROM", POS, REF, ALT, transcript
        """

        # Drop transcript view if necessary
        if transcripts_table_drop:
            query_drop = f"""
                DROP TABLE IF EXISTS {transcripts_table};
            """
            self.execute_query(query=query_drop)

        # Merge and create transcript view
        query_create_view = f"""
            CREATE TABLE IF NOT EXISTS {transcripts_table}
            AS {query_merge_on_transcripts}
        """
        self.execute_query(query=query_create_view)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    else:

        # No transcripts structure configured
        transcripts_table = None

    return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- `transcripts_table`: The `transcripts_table` parameter in the `create_transcript_view` function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will determine the table name from the configuration, defaulting to "transcripts".
- `transcripts_table_drop`: The `transcripts_table_drop` parameter in the `create_transcript_view` function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, the function will drop the existing transcripts table if it exists. Defaults to `True`.
- `param`: The `param` parameter in the `create_transcript_view` function is a dictionary that contains the information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other information necessary for generating the view. This parameter allows for flexibility and customization.
Returns
The `create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
def annotation_format_to_table(
    self,
    uniquify: bool = True,
    annotation_field: str = "ANN",
    annotation_id: str = "Feature_ID",
    view_name: str = "transcripts",
) -> str:
    """
    Convert a packed annotation INFO field (e.g. snpEff "ANN") into a
    structured temporary table, one column per annotation sub-field.

    The sub-field names are parsed from the quoted header description of
    `annotation_field`; each annotation entry is exploded to JSON, its keys
    are typed by inspection, and a temporary table named `view_name` is
    created with one row per annotation entry plus a 'transcript' column
    taken from `annotation_id`.

    :param uniquify: whether the exploded annotation entries are uniquified,
        defaults to True
    :type uniquify: bool (optional)
    :param annotation_field: name of the INFO field carrying the packed
        annotations, defaults to "ANN"
    :type annotation_field: str (optional)
    :param annotation_id: annotation sub-field used as transcript identifier
        (non-alphanumeric characters are stripped), defaults to "Feature_ID"
    :type annotation_id: str (optional)
    :param view_name: name of the temporary table to create, defaults to
        "transcripts"
    :type view_name: str (optional)
    :return: the name of the created table, or None when `annotation_field`
        is not declared in the VCF header
    :raises ValueError: when the header description of `annotation_field`
        does not contain a quoted ' | '-separated sub-field list
    """

    # Name of the working column holding the exploded JSON annotations
    annotation_format = "annotation_explode"

    # Transcript annotation: keep alphanumeric characters only
    annotation_id = "".join(char for char in annotation_id if char.isalnum())

    # Prefix.
    # NOTE(review): any truthy prefix is overwritten with "INFO/" — looks
    # intentional but loses custom prefixes; confirm against
    # get_explode_infos_prefix() semantics.
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # Prefixed column names for the annotation field and its exploded form
    annotation_infos = prefix + annotation_field
    annotation_format_infos = prefix + annotation_format

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added to the variants table (dropped at the end)
    added_columns = []

    # Explode the annotation field out of INFO into its own column
    added_columns += self.explode_infos(fields=[annotation_field])

    if annotation_field in vcf_reader.infos:

        # Extract the sub-field names from the quoted part of the header
        # description (e.g. "... 'Allele | Annotation | ... '")
        ann_description = vcf_reader.infos[annotation_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header = []
            ann_header_desc = {}
            for i in range(len(ann_header_match)):
                # Sanitized name -> original name mapping
                ann_header_info = "".join(
                    char for char in ann_header_match[i] if char.isalnum()
                )
                ann_header.append(ann_header_info)
                ann_header_desc[ann_header_info] = ann_header_match[i]
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id column (dropped at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Create dataframe with the annotation column per variant
        dataframe_annotation_format = self.get_query_to_df(
            f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
        )

        # Explode each packed annotation value into JSON keyed by sub-field
        dataframe_annotation_format[
            annotation_format_infos
        ] = dataframe_annotation_format[annotation_infos].apply(
            lambda x: explode_annotation_format(
                annotation=str(x),
                uniquify=uniquify,
                output_format="JSON",
                prefix="",
                header=list(ann_header_desc.values()),
            )
        )

        # Find the JSON keys actually present.
        # NOTE(review): the SQL references the unprefixed column name
        # {annotation_format}, while the dataframe column is
        # {annotation_format_infos} — equivalent only when prefix is '';
        # confirm behavior when a prefix is configured.
        query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
        df_keys = self.get_query_to_df(query=query_json)

        # Build one typed extraction expression per key
        query_json_key = []
        for _, row in df_keys.iterrows():

            # Key
            key = row.iloc[0]

            # key_clean: alphanumeric-only version used as column name
            key_clean = "".join(char for char in key if char.isalnum())

            # Query returning all non-empty values of this key, for type detection
            query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

            # Get DataFrame from query
            df_json_type = self.get_query_to_df(query=query_json_type)

            # Fill missing values with empty strings and then replace empty
            # strings or None with NaN and drop rows with NaN, so only real
            # values drive the type detection
            with pd.option_context("future.no_silent_downcasting", True):
                df_json_type.fillna(value="", inplace=True)
                replace_dict = {None: np.nan, "": np.nan}
                df_json_type.replace(replace_dict, inplace=True)
                df_json_type.dropna(inplace=True)

            # Detect column type (project helper)
            column_type = detect_column_type(df_json_type[key_clean])

            # Append a typed, NULL-safe extraction for this key
            query_json_key.append(
                f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
            )

        # Create the temporary table with a 'transcript' column from annotation_id
        query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
        self.execute_query(query=query_view)

    else:

        # Annotation field not declared in the header: nothing to build
        view_name = None

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    return view_name
The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
table format.

Parameters

- `uniquify`: a boolean flag that determines whether to ensure unique values in the output or not. If set to `True`, the function will make sure that the output values are unique. Defaults to `True`.
- `annotation_field`: the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. Defaults to `"ANN"`.
- `annotation_id`: the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry. Defaults to `"Feature_ID"`.
- `view_name`: the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis. Defaults to `"transcripts"`.

Returns

The name of the view created, which is stored in the variable `view_name`.
9987 def transcript_view_to_variants( 9988 self, 9989 transcripts_table: str = None, 9990 transcripts_column_id: str = None, 9991 transcripts_info_json: str = None, 9992 transcripts_info_field: str = None, 9993 param: dict = {}, 9994 ) -> bool: 9995 """ 9996 The function `transcript_view_to_variants` takes input parameters related to transcripts and updates 9997 a variants table with information from the transcripts in JSON format. 9998 9999 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table 10000 containing the transcripts data. If this parameter is not provided, the function will attempt to 10001 retrieve it from the `param` dictionary or use a default value of "transcripts" 10002 :type transcripts_table: str 10003 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in 10004 the `transcripts_table` that contains the unique identifier for each transcript. This identifier is 10005 used to match transcripts with variants in the database 10006 :type transcripts_column_id: str 10007 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of 10008 the column in the variants table where the transcripts information will be stored in JSON format 10009 :type transcripts_info_json: str 10010 :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field 10011 in the VCF header that will contain information about transcripts in JSON format. This field will be 10012 added to the VCF header as an INFO field with the specified name 10013 :type transcripts_info_field: str 10014 :param param: The `transcript_view_to_variants` method takes several parameters: 10015 :type param: dict 10016 :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the 10017 operation is successful and `False` if certain conditions are not met. 
10018 """ 10019 10020 log.debug("Start transcripts view to JSON...") 10021 10022 # Default 10023 transcripts_table_default = "transcripts" 10024 transcripts_column_id_default = "transcript" 10025 transcripts_info_json_default = None 10026 transcripts_info_field_default = None 10027 10028 # Param 10029 if not param: 10030 param = self.get_param() 10031 10032 # Transcripts table 10033 if transcripts_table is None: 10034 transcripts_table = param.get("transcripts", {}).get( 10035 "table", transcripts_table_default 10036 ) 10037 10038 # Transcripts column ID 10039 if transcripts_column_id is None: 10040 transcripts_column_id = param.get("transcripts", {}).get( 10041 "column_id", transcripts_column_id_default 10042 ) 10043 10044 # Transcripts info field 10045 if transcripts_info_json is None: 10046 transcripts_info_json = param.get("transcripts", {}).get( 10047 "transcripts_info_json", transcripts_info_json_default 10048 ) 10049 10050 # Transcripts info field 10051 if transcripts_info_field is None: 10052 transcripts_info_field = param.get("transcripts", {}).get( 10053 "transcripts_info_field", transcripts_info_field_default 10054 ) 10055 10056 # Variants table 10057 table_variants = self.get_table_variants() 10058 10059 # Check info columns param 10060 if transcripts_info_json is None and transcripts_info_field is None: 10061 return False 10062 10063 # Transcripts infos columns 10064 query_transcripts_infos_columns = f""" 10065 SELECT * 10066 FROM ( 10067 DESCRIBE SELECT * FROM {transcripts_table} 10068 ) 10069 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 10070 """ 10071 transcripts_infos_columns = list( 10072 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 10073 ) 10074 10075 # View results 10076 clause_select = [] 10077 clause_to_json = [] 10078 for field in transcripts_infos_columns: 10079 clause_select.append( 10080 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10081 ) 10082 
clause_to_json.append(f""" '{field}': "{field}" """) 10083 10084 # Update 10085 update_set = [] 10086 10087 # VCF header 10088 vcf_reader = self.get_header() 10089 10090 # Transcripts to info column in JSON 10091 if transcripts_info_json is not None: 10092 10093 # Create column on variants table 10094 self.add_column( 10095 table_name=table_variants, 10096 column_name=transcripts_info_json, 10097 column_type="JSON", 10098 default_value=None, 10099 drop=False, 10100 ) 10101 10102 # Add to update 10103 update_set.append( 10104 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 10105 ) 10106 10107 # Add header 10108 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 10109 transcripts_info_json, 10110 ".", 10111 "String", 10112 "Transcripts in JSON format", 10113 "unknwon", 10114 "unknwon", 10115 self.code_type_map["String"], 10116 ) 10117 10118 # Transcripts to info field in JSON 10119 if transcripts_info_field is not None: 10120 10121 # Add to update 10122 update_set.append( 10123 f""" 10124 INFO = concat( 10125 CASE 10126 WHEN INFO NOT IN ('', '.') 10127 THEN INFO 10128 ELSE '' 10129 END, 10130 CASE 10131 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 10132 THEN concat( 10133 ';{transcripts_info_field}=', 10134 t.{transcripts_info_json} 10135 ) 10136 ELSE '' 10137 END 10138 ) 10139 """ 10140 ) 10141 10142 # Add header 10143 vcf_reader.infos[transcripts_info_field] = vcf.parser._Info( 10144 transcripts_info_field, 10145 ".", 10146 "String", 10147 "Transcripts in JSON format", 10148 "unknwon", 10149 "unknwon", 10150 self.code_type_map["String"], 10151 ) 10152 10153 # Update query 10154 query_update = f""" 10155 UPDATE {table_variants} 10156 SET {", ".join(update_set)} 10157 FROM 10158 ( 10159 SELECT 10160 "#CHROM", POS, REF, ALT, 10161 concat( 10162 '{{', 10163 string_agg( 10164 '"' || "{transcripts_column_id}" || '":' || 10165 to_json(json_output) 10166 ), 10167 '}}' 10168 )::JSON AS {transcripts_info_json} 10169 FROM 10170 ( 10171 
SELECT 10172 "#CHROM", POS, REF, ALT, 10173 "{transcripts_column_id}", 10174 to_json( 10175 {{{",".join(clause_to_json)}}} 10176 )::JSON AS json_output 10177 FROM 10178 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10179 WHERE "{transcripts_column_id}" IS NOT NULL 10180 ) 10181 GROUP BY "#CHROM", POS, REF, ALT 10182 ) AS t 10183 WHERE {table_variants}."#CHROM" = t."#CHROM" 10184 AND {table_variants}."POS" = t."POS" 10185 AND {table_variants}."REF" = t."REF" 10186 AND {table_variants}."ALT" = t."ALT" 10187 """ 10188 10189 self.execute_query(query=query_update) 10190 10191 return True
The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
a variants table with information from the transcripts in JSON format.

Parameters

- `transcripts_table`: the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of `"transcripts"`.
- `transcripts_column_id`: the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- `transcripts_info_json`: the name of the column in the variants table where the transcripts information will be stored in JSON format.
- `transcripts_info_field`: the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name.
- `param`: a dictionary of parameters used as a fallback for the arguments above.

Returns

A boolean value, which is `True` if the operation is successful and `False` if certain conditions are not met.